In [2]:
from config import AGD20K_PATH 
from PIL import Image
import io
import base64

# 1. 메모리 버퍼(BytesIO) 생성
buffered = io.BytesIO()


In [66]:
import api_key
import openai
client_gpt = openai.OpenAI()
gpt_model_name="gpt-5.1-2025-11-13"
client_gemini = openai.OpenAI(
    api_key=api_key.gemini_api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)
gemini_model_name = "gemini-2.5-pro"

In [48]:
question = "hello. which model are you?"
messages = [{'role': 'user', 'content': question}]
response_gpt = client_gpt.chat.completions.create(
    model="gpt-5.1-2025-11-13",
    messages=messages,
)
response_gpt.choices[0].message.content

'I’m an OpenAI assistant based on the GPT‑4.1 family of models, with image understanding enabled.'

In [50]:
question = "hello. which model are you?"
messages = [{'role': 'user', 'content': question}]
response_gemini = client_gemini.chat.completions.create(
    model="gemini-2.5-pro",
    messages=messages,
)
response_gemini.choices[0].message.content

'I am a large language model, trained by Google.'

In [1]:
import re
import pandas as pd
import ast  # 문자열로 된 리스트를 실제 리스트로 변환하기 위해 사용

# 파일 경로 설정
file_path = '/home/bongo/porter_notebook/research/qwen3/32B_ego_exo_relative_prompt5/ego_exo_prompt5_relative.log'

# 데이터를 담을 리스트
data_list = []

# 정규표현식 패턴 컴파일
# 패턴 1: Action, Object, image_name 추출
# 구조: Action : {값}, Object : {값} image_name : {값}
pattern_meta = re.compile(r"Action\s*:\s*(.+?),\s*Object\s*:\s*(.+?)\s+image_name\s*:\s*(.+)")

# 패턴 2: parsed dots 추출
# 구조: parsed dots!!! : {리스트형태}
pattern_dots = re.compile(r"parsed dots!!!\s*:\s*(.+)")

# 임시 저장용 딕셔너리
current_entry = {}


In [25]:

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        
        # 1. Action, Object, Filename 라인 찾기
        match_meta = pattern_meta.search(line)
        if match_meta:
            # 새로운 항목 시작
            current_entry = {} 
            current_entry['Action'] = match_meta.group(1).strip()
            current_entry['Object'] = match_meta.group(2).strip()
            current_entry['filename'] = match_meta.group(3).strip()
            continue
        
        # 2. Parsed dots 라인 찾기
        match_dots = pattern_dots.search(line)
        if match_dots and 'Action' in current_entry:
            dots_str = match_dots.group(1).strip()
            try:
                # 문자열 "[[1,2], ...]"을 실제 리스트 객체로 변환
                current_entry['parsed_dots'] = ast.literal_eval(dots_str)
            except:
                # 변환 실패 시 문자열 그대로 저장
                current_entry['parsed_dots'] = dots_str
            
            # 필요한 정보가 다 모였으므로 리스트에 추가
            data_list.append(current_entry)
            current_entry = {} # 초기화

# 데이터프레임 생성
df = pd.DataFrame(data_list)
df.columns = ['action','object','filename','dots']
df_fin = df.loc[df[['action','object','filename']].drop_duplicates().index].reset_index(drop=True)
# df = df.drop_duplicates()#.reset_index(drop=True)

df_fin

Unnamed: 0,action,object,filename,dots
0,jump,skis,skis_002829.jpg,"[[350, 200], [350, 600], [350, 900]]"
1,jump,skateboard,skateboard_002387.jpg,"[[167, 458], [500, 480], [833, 430]]"
2,jump,surfboard,surfboard_000658.jpg,"[[250, 300], [500, 500], [750, 700]]"
3,jump,snowboard,snowboard_001704.jpg,"[[500, 450], [300, 550], [700, 550]]"
4,peel,carrot,carrot_003707.jpg,"[[210, 345], [480, 390], [770, 460]]"
...,...,...,...,...
116,eat,apple,apple_001541.jpg,"[[500, 460], [350, 400], [700, 450]]"
117,swing,baseball_bat,baseball_bat_001882.jpg,"[[500, 500], [200, 750], [700, 250]]"
118,swing,tennis_racket,tennis_racket_003066.jpg,"[[500, 300], [500, 650], [500, 900]]"
119,swing,golf_clubs,golf_clubs_001992.jpg,"[[175, 48], [165, 495], [135, 950]]"


In [28]:
df_fin[(df_fin['object']=='bicycle')&(df_fin['action']=='sit_on')]

Unnamed: 0,action,object,filename,dots
19,sit_on,bicycle,bicycle_002100.jpg,"[[385, 292], [415, 362], [450, 510]]"


In [94]:
file_name_real

'/home/DATA/AGD20K/Seen/testset/egocentric/hold/baseball_bat/baseball_bat_002547.jpg'

In [125]:

from io import BytesIO
def make_input_image(file_name_real):
    # 1. 이미지 열기 및 리사이즈
    with Image.open(file_name_real) as img:
        img = img.convert("RGB")
        resized_image = img.resize((1000, 1000))
        
        # 2. 함수 내부에서 버퍼 생성 (with 구문 사용 추천 X -> getvalue 후엔 자동 GC됨)
        buffered = BytesIO()
        # 3. 버퍼에 저장 (메모리에 JPEG 생성)
        resized_image.save(buffered, format="JPEG")
        
        # 4. 바로 인코딩 후 리턴 (한 줄로 처리)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

# 2. 강력한 프롬프트 작성
system_prompt = """
You are an expert in Visual Affordance Grounding. 
Your task is to evaluate whether a specific pixel coordinate on an image is a valid region for a human to perform a specific action on an object.
"""

def input_prompt_gpt(action, object_name, dot):
    return f"""
You are an affordance-evaluation model.

### Inputs
- Action: "{action}"
- Object: "{object_name}"
- QueryPoint: ({dot[0]}, {dot[1]})
- ImageResolution: 1000x1000

### Task
Determine whether the QueryPoint lies **inside the affordance region** for performing the specified action on the object.
- The affordance region is the physical part of the object that enables the action.
- Ignore all irrelevant areas (background, non-functional parts, etc.).

### Requirements
Respond **only** in the following JSON structure:

{{
  "result": "Pass" or "Fail",
  "reason": "<brief explanation>"
}}
"""



def input_prompt(action, object_name, dot):
    return f"""
    Analyze the provided image with the following details:

    1. **Target Action**: {action}
    2. **Target Object**: {object_name}
    3. **Query Point**: ({dot[0]},{dot[1]}) 
    4. **Image Resolution**: 1000x1000

    **Task**:
    Evaluate if the "Query Point" falls within the **affordance region** specific to the "{action}" on the "{object_name}". 
    (e.g., If action is 'jump' on 'skateboard', the point should be on the deck where feet act, not on the wheels or background.)
    **Output Format**:
    Provide the result in JSON format only:
    {{
        "result": <Pass or Fail>,
        "reason": "<short explanation why>"
    }}
    """

import json
import re



def parse_llm_json(text):
    """
    마크다운 코드 블록(```json ... ```)을 제거하고 JSON으로 변환하는 함수
    """
    try:
        # 1. 정규표현식으로 ```json 과 ``` 사이의 내용만 추출
        # re.DOTALL: 줄바꿈(\n)도 포함해서 찾기 위함
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
        
        if match:
            json_str = match.group(1)  # 코드 블록 안의 내용만 가져옴
        else:
            json_str = text  # 코드 블록이 없으면 원본 그대로 사용 시도
            
        # 2. JSON 파싱
        return json.loads(json_str)
        
    except json.JSONDecodeError as e:
        print(f"JSON 파싱 실패: {e}")
        return None


In [126]:
for index, row in df_fin.iterrows():
    object_name = row['object']
    action = row['action']
    filename = row['filename']
    dot_list =  row['dots']
    file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
    if (object_name=='cup')&(action =='sip'):
        print(index,object_name,action,filename)
        image_base64 = make_input_image(file_name_real)
        dot_res_list = []
        dot_reason_list = []
        for dot in dot_list:

            question = "tell me about the image"
            messages = [
            {"role": "system", "content": system_prompt},
            {
            "role": "user",
            "content": [
                {"type": "text", "text": input_prompt_gpt(action, object_name, dot)},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
                        ]
            }]
            # 2. 추론 (OpenAI API 호출)
            response = client_gpt.chat.completions.create(
            model=gpt_model_name,
            messages=messages,
            )
            result = response.choices[0].message.content
            result
            print(f"{dot} : {result}")
            llM_result_json = parse_llm_json(result)
            dot_res_list.append(llM_result_json['result'])
            dot_reason_list.append(llM_result_json['reason'])

        print(a)
    else:
        continue
    

35 cup sip cup_001864.jpg
[490, 490] : {
  "result": "Fail",
  "reason": "The point lies on the coffee beans background, not on the rim or drinkable opening of the cup required for sipping."
}
[400, 400] : {
  "result": "Fail",
  "reason": "The point (400,400) lies on the coffee beans background, not on the rim or opening of the cup that affords sipping."
}
[580, 400] : {
  "result": "Fail",
  "reason": "The point lies on the surrounding coffee beans, not on the cup rim or opening area used for sipping."
}


NameError: name 'a' is not defined

In [None]:
llM_result_json['score']

{'score': 'Pass',
 'reason': 'The query point is located on the rim of the cup, which is the correct area for a person to drink from.'}

In [114]:
result_row = []
reason_row = []
for index, row in df_fin.iterrows():
    object_name = row['object']
    action = row['action']
    filename = row['filename']
    dot_list =  row['dots']
    file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
    # if (object_name=='cup')&(action =='drink_with'):
    print(index,object_name,action,filename)
    image_base64 = make_input_image(file_name_real)
    dot_res_list = []
    dot_reason_list = []

    for dot in dot_list:

        question = "tell me about the image"
        messages = [
        {"role": "system", "content": system_prompt},
        {
        "role": "user",
        "content": [
            {"type": "text", "text": input_prompt(action, object_name, dot)},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
                    ]
        }]
        # 2. 추론 (OpenAI API 호출)
        response = client_gemini.chat.completions.create(
        model= 'gemini-2.5-flash-lite' , #gemini_model_name,
        messages=messages,
        )
        result = response.choices[0].message.content
        result
        print(f"{dot} : {result}")
        llM_result_json = parse_llm_json(result)
        dot_res_list.append(llM_result_json['score'])
        dot_reason_list.append(llM_result_json['reason'])
    result_row.append(dot_res_list)
    reason_row.append(dot_reason_list)

    

0 skis jump skis_002829.jpg
[350, 200] : ```json
{
  "score": "Fail",
  "reason": "The query point (350, 200) is located on the upper section of the ski, specifically above where a skier's boots would be placed. The action 'jump' on skis would involve the entire ski for propulsion and landing, but the primary interaction area for a jump maneuver is typically around the binding area."
}
```
[350, 600] : ```json
{
    "score": "Pass",
    "reason": "The point (350,600) falls on the lower portion of the left ski, which is a valid area for a person's feet to be positioned when performing a jump."
}
```
[350, 900] : ```json
{
    "score": "Fail",
    "reason": "The query point (350,900) is located on the bottom tip of the right ski, which is not a suitable area for performing a jump action on skis. The jump action typically involves the entire length of the skis, particularly the bindings and the middle section where the feet are placed."
}
```
1 skateboard jump skateboard_002387.jpg
[167, 

KeyboardInterrupt: 

In [111]:
gemini_model_name

'gemini-2.5-pro'

In [2]:
import pandas as pd

In [3]:
df_fin2 = pd.read_pickle('test_verify.pkl')
df_fin2

NameError: name 'pd' is not defined

'```json\n{\n    "score": 100,\n    "reason": "The query point is located directly on the saddle (seat) of the bicycle, which is the specific part designed for the action \'sit_on\'."\n}\n```'

In [86]:
question = "tell me about the image"
messages = [
    {"role": "system", "content": system_prompt},
    {
    "role": "user",
    "content": [
        {"type": "text", "text": user_prompt},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_string}"}}
                ]
    }]
# 2. 추론 (OpenAI API 호출)
response = client_gemini.chat.completions.create(
    model=gemini_model_name,
    messages=messages,
)
result = response.choices[0].message.content
result


'```json\n{\n    "score": 48,\n    "reason": "The query point is on the main frame of the bicycle. While this is part of the target object, the correct affordance region for the action \'sit_on\' is the saddle/seat."\n}\n```'

'{\n  "score": 15,\n  "reason": "The point lies on the pavement near the bicycle, not on the bicycle’s seat or any other part suitable for sitting on."\n}'

'The image shows a bright yellow bicycle parked on a wide sidewalk next to a road. The bike is slightly tilted, resting on its kickstand, and has a front basket and a rear fender.  \n\nOn the left side of the image there is a green park area with neatly trimmed grass and young trees supported by stakes. On the right is a multi-lane road with a few vehicles in the distance. The sky is overcast with gray clouds, giving the scene a calm, quiet, almost empty-street feel.'