In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from pathlib import Path
from scipy.stats import pearsonr
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

# Load the CLIPSeg processor and segmentation model from one shared checkpoint id.
clipseg_checkpoint = "CIDAS/clipseg-rd64-refined"
clip_processor = CLIPSegProcessor.from_pretrained(clipseg_checkpoint)
clip_model = CLIPSegForImageSegmentation.from_pretrained(clipseg_checkpoint)

import os
import sys
# Put the parent of the working directory on sys.path so the project imports
# that follow can be resolved (presumably those modules live there).
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from config import AGD20K_PATH, model_name

from VLM_model_dot_relative import MetricsTracker
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
    load_ground_truth,
    prompt_dict_obj,
    get_clipseg_heatmap,
    calculate_metrics,
    prompt_dict_obj
)

def min_max_normalize(arr):
    """Linearly rescale ``arr`` so that its minimum maps to 0 and its maximum to ~1.

    A constant array (max == min) yields ``np.zeros_like(arr)``, i.e. zeros with
    the same shape and dtype as the input. Otherwise the result is
    ``(arr - min) / (max - min + 1e-8)``.
    """
    lowest = arr.min()
    value_range = arr.max() - lowest
    if value_range == 0:
        return np.zeros_like(arr)
    shifted = arr - lowest
    return shifted / (value_range + 1e-8)



from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# Load the processor (and its tokenizer) for the Qwen3-VL checkpoint.
# Note: this assignment replaces the model_name imported from config above.
model_name = "Qwen/Qwen3-VL-32B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
tok = processor.tokenizer

# Echo the dataset root as the cell output.
AGD20K_PATH

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


'/home/DATA/AGD20K'

In [3]:
import numpy as np

def check_heatmap_containment(heatmap_top, heatmap_obj, threshold=0.15, containment_ratio=0.8):
    """
    Decide whether the active region of ``heatmap_top`` is a smaller region that
    lies (mostly) inside the active region of ``heatmap_obj``.

    Both heatmaps are binarized with ``value > threshold``; the test then compares
    the pixel counts of the two masks and of their intersection.

    Args:
        heatmap_top: Heatmap of the candidate sub-region. Either array-like or an
            object exposing ``.detach().cpu().numpy()`` (e.g. a tensor).
        heatmap_obj: Heatmap of the whole object, same kinds of input accepted.
            Must be broadcast-compatible with ``heatmap_top``.
        threshold (float): Values strictly greater than this are counted as active.
        containment_ratio (float): Minimum fraction of the active top area that
            must overlap the active object area (default 0.8 = 80%).

    Returns:
        bool: True only if the top mask is non-empty, has strictly fewer active
        pixels than the object mask, and at least ``containment_ratio`` of its
        active pixels are also active in the object mask. False otherwise.
    """
    # 1. Convert tensors to numpy; plain sequences are accepted via np.asarray.
    if hasattr(heatmap_top, 'cpu'):
        heatmap_top = heatmap_top.detach().cpu().numpy()
    if hasattr(heatmap_obj, 'cpu'):
        heatmap_obj = heatmap_obj.detach().cpu().numpy()
    heatmap_top = np.asarray(heatmap_top)
    heatmap_obj = np.asarray(heatmap_obj)

    # 2. Binary masks of the active pixels.
    mask_top = heatmap_top > threshold
    mask_obj = heatmap_obj > threshold

    # 3. Active areas (pixel counts).
    area_top = np.sum(mask_top)
    area_obj = np.sum(mask_obj)

    # An empty top mask can never count as "contained".
    if area_top == 0:
        return False

    # Condition 1: the top region must be strictly smaller than the object region.
    is_smaller = area_top < area_obj

    # Condition 2: enough of the top region overlaps the object region.
    # Equivalent to (intersection_area / area_top) >= containment_ratio,
    # written without the division.
    intersection_area = np.sum(np.logical_and(mask_top, mask_obj))
    is_inside = intersection_area >= (area_top * containment_ratio)

    # Return a plain Python bool rather than numpy.bool_.
    return bool(is_smaller and is_inside)

In [7]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

# Tracker that receives the per-sample metrics computed in the loop below.
metrics_tracker_alloutput = MetricsTracker(name="all_output")

# Directory that receives the saved figures (the name encodes the run).
Layername = "selection_hold_axe"
output_dir = "./exo_" + Layername
os.makedirs(output_dir, exist_ok=True)

# Side length of the square attention grid used for every heatmap below.
patch_size = 24
POS_ALPHA = 0
results_list = []
pos_map = np.zeros((patch_size, patch_size), dtype=np.float32)

# Load the pickled model outputs for this run; fail early if the file is missing.
# NOTE: read_pickle executes arbitrary pickled code - only load files you produced.
pkl_path = "output_exo_selection/exo_sample_32B_hold_axe.pkl"
if os.path.exists(pkl_path):
    df_output = pd.read_pickle(pkl_path)
else:
    raise FileNotFoundError(f"File not found: {pkl_path}")

# Loop-invariant settings, hoisted out of the per-sample loop.
NUM_SELECT_MAX = 1      # upper bound on how many top / bottom tokens are averaged
CONTRASTIVE_ALPHA = 0   # weight of the noise map in (signal - alpha * noise); 0 disables the subtraction
gamma = 0.75            # exponent applied to the attention x CLIPSeg product

# For every row of df_output: build a token-level attention map, combine it with a
# CLIPSeg heatmap, score the blurred result against the ground truth, record the
# metrics in results_list and save a 7-panel figure to output_dir.
for idx, row in df_output.iterrows():
    object_name = row['object']
    action = row['action']
    filename = row['filename']
    output_description = row['output_sentence']
    output_attentions = row['output_attentions']
    PLSP_name = prompt_dict_obj[action][object_name]
    exo_name = os.path.basename(row['exo_filename'])
    print(f"exo_name : {exo_name}")
    exo_base_path = Path(f"{AGD20K_PATH}/Seen/trainset/exocentric/{action}/{object_name}/{exo_name}")

    sum_heatmap = np.zeros((patch_size, patch_size), dtype=np.float32)

    # The exo image is only shown in the figure, so a missing/unreadable file
    # must not abort the run: cv2.imread returns None instead of raising.
    exo_bgr = cv2.imread(str(exo_base_path))
    if exo_bgr is None:
        print(f"Exo image not found: {exo_base_path}")
        exo_img = None
    else:
        exo_img = cv2.cvtColor(exo_bgr, cv2.COLOR_BGR2RGB)

    # splitext drops only the final extension, so names with extra dots stay intact.
    file_stem = os.path.splitext(filename)[0]
    file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
    gt_path = f"{AGD20K_PATH}/Seen/testset/GT/{action}/{object_name}/{file_stem}.png"

    # Load the egocentric image; skip the sample if it is missing or unreadable.
    if not os.path.exists(file_name_real):
        print(f"Image not found: {file_name_real}")
        continue

    orig_bgr = cv2.imread(file_name_real)
    if orig_bgr is None:
        print(f"Image could not be read: {file_name_real}")
        continue
    orig_img = cv2.cvtColor(orig_bgr, cv2.COLOR_BGR2RGB)
    h, w, _ = orig_img.shape

    print(f"[{idx}] Processing: {action} - {object_name}")

    # --- 2. Per-token attention maps -------------------------------------------
    # Each generated token contributes one heatmap: the sum of all its recorded
    # (layer, head) heatmaps. No layer/head filtering is applied.
    token_scores = []
    for token in output_attentions:
        token_heatmap = np.zeros((patch_size, patch_size), dtype=np.float32)
        token_head_count = 0

        for each_attention in token['attentions']:
            sum_heatmap += each_attention['heatmap']
            token_heatmap += each_attention['heatmap']
            token_head_count += 1

        # Tokens without any recorded attention entry are skipped.
        if token_head_count == 0:
            continue

        token_scores.append({
            "token": token['token_str'],
            # Position inside token_scores (skipped tokens are not counted).
            "token_idx": len(token_scores),
            # Score of a token = total mass of its summed heatmap.
            "score": token_heatmap.sum(),
            "heatmap": token_heatmap,
            "count": token_head_count
        })

    if len(token_scores) == 0:
        print("No valid tokens found.")
        continue

    # Ascending by score: lowest-scoring tokens first, highest last.
    sorted_tokens = sorted(token_scores, key=lambda x: x['score'])

    # Number of tokens taken from each end: at most NUM_SELECT_MAX, at least 1.
    num_select = max(1, min(NUM_SELECT_MAX, len(sorted_tokens) // 2))

    bottom_tokens = sorted_tokens[:num_select]       # lowest scores ("noise")
    top_tokens = sorted_tokens[-num_select:][::-1]   # highest scores first ("signal")

    # top_tokens is in descending order, so index 0 is the highest-scoring token
    # (identical to the last element while num_select == 1).
    top_token_idx = top_tokens[0]['token_idx']
    top_token_text = top_tokens[0]['token']

    # The token right after the top token is appended to form the CLIPSeg prompt.
    # token_idx equals the position in token_scores, so it can be indexed directly.
    # If the top token is the last one there is nothing to append.
    following_token_idx = top_token_idx + 1
    if following_token_idx < len(token_scores):
        following_text = token_scores[following_token_idx]['token']
        top_phrase = top_token_text + ' ' + following_text
    else:
        following_text = ''
        top_phrase = top_token_text

    clip_object_heatmap = get_clipseg_heatmap(
        file_name_real,
        clip_model,
        clip_processor,
        object_name,
    )

    clip_top_heatmap = get_clipseg_heatmap(
        file_name_real,
        clip_model,
        clip_processor,
        top_phrase,
    )

    # Signal map: mean heatmap of the selected top tokens.
    pos_map = np.sum([t['heatmap'] for t in top_tokens], axis=0)
    pos_map /= len(top_tokens)

    # Noise map: mean heatmap of the selected bottom tokens.
    neg_map = np.sum([t['heatmap'] for t in bottom_tokens], axis=0)
    neg_map /= len(bottom_tokens)

    # Bring both maps to a max of 1 before subtracting so their scales match.
    if pos_map.max() > 0: pos_map /= pos_map.max()
    if neg_map.max() > 0: neg_map /= neg_map.max()

    # Contrastive subtraction (signal - alpha * noise), then min-max to [0, 1].
    contrastive_heatmap = (pos_map) - (CONTRASTIVE_ALPHA * neg_map)
    h_min, h_max = contrastive_heatmap.min(), contrastive_heatmap.max()
    avg_norm = (contrastive_heatmap - h_min) / (h_max - h_min + 1e-8)

    # --- 3. Choose the CLIPSeg prompt --------------------------------------------
    # Use the token-phrase heatmap only when its active region is a smaller region
    # inside the object heatmap; otherwise fall back to the object name.
    if check_heatmap_containment(clip_top_heatmap, clip_object_heatmap):
        clip_heatmap = clip_top_heatmap
        clipseg_input_text = top_phrase
    else:
        clip_heatmap = clip_object_heatmap
        clipseg_input_text = object_name
    # Log the prompt that was actually used (the full phrase, not just the top token).
    print(f"Selected CLIP input : {clipseg_input_text}")

    clip_heatmap_resized = cv2.resize(clip_heatmap, (patch_size, patch_size), interpolation=cv2.INTER_LINEAR)
    clip_binary_mask = (clip_heatmap_resized > 0.15).astype(np.float32)  # kept for optional use

    # Hadamard product of attention and CLIPSeg map, followed by gamma correction.
    # Clamp at 0 first: a negative base with a fractional exponent would give NaN.
    avg_norm_cliped = np.maximum(avg_norm * clip_heatmap_resized, 0.0)
    avg_norm_cliped = np.power(avg_norm_cliped, gamma)

    # Upsample to image size and blur with sigma = 5% of the shorter image side.
    avg_norm_cliped_rescaled = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)

    sig = min(w, h) * 0.05
    k_val = int(sig * 3) * 2 + 1  # odd kernel size covering about +/- 3 sigma
    kernel_size = (k_val, k_val)
    blur_map = cv2.GaussianBlur(avg_norm_cliped_rescaled, kernel_size, sig)

    # Re-normalize after blurring.
    blur_map = min_max_normalize(blur_map)
    avg_norm_cliped_blur = blur_map

    # Image-sized copies of the patch-grid maps, for visualization only.
    avg_norm_resized_vis = cv2.resize(avg_norm, (w, h), interpolation=cv2.INTER_LINEAR)
    clip_vis = cv2.resize(clip_heatmap_resized, (w, h), interpolation=cv2.INTER_NEAREST)

    # --- 4. Ground-truth evaluation ----------------------------------------------
    gt_map = load_ground_truth(gt_path)
    if gt_map is None:
        print("NO GT!!!")
        continue

    metrics_dino = calculate_metrics(avg_norm_cliped_blur, gt_map)
    metrics_tracker_alloutput.update(metrics_dino)

    metrics_text = f"[{object_name} {action}] KLD: {metrics_dino['KLD']:.4f} | SIM: {metrics_dino['SIM']:.4f} | NSS: {metrics_dino['NSS']:.4f}"
    metrics_tracker_alloutput.print_metrics(metrics_dino, filename)

    results_list.append({
        'object': object_name,
        'action': action,
        'filename': filename,
        'output_sentence': output_description,
        'top_token_text': top_token_text,
        'following_text': following_text,
        'clip_input': clipseg_input_text,
        'KLD': metrics_dino['KLD'],
        'SIM': metrics_dino['SIM'],
        'NSS': metrics_dino['NSS']
    })

    # --- 5. Visualization --------------------------------------------------------
    fig, axes = plt.subplots(1, 7, figsize=(24, 5))

    # Selected top tokens, shown in the figure title.
    top_words = ",".join([f"'{t['token'].strip()}'" for t in top_tokens[:5]])

    main_title = f"Obj: {object_name} | Act: {action} |{metrics_text}\nTop Tokens: [{top_words}({top_token_idx} ), clipseg input : {top_phrase}] \n Whole answer : {output_description}"
    fig.suptitle(main_title, fontsize=14, fontweight='bold', y=0.98)

    # (1) Original egocentric image
    axes[0].imshow(orig_img)
    axes[0].set_title(f"Original\n({object_name})")
    axes[0].axis('off')

    # (2) Normalized attention map
    im1 = axes[1].imshow(avg_norm_resized_vis, cmap='jet', interpolation='bilinear')
    axes[1].set_title(f"Attention Map {Layername}")
    axes[1].axis('off')
    plt.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)

    # (3) CLIPSeg result for the chosen prompt
    axes[2].imshow(clip_vis, cmap='gray')
    axes[2].set_title(f"CLIPSeg {clipseg_input_text}")
    axes[2].axis('off')

    # (4) Hadamard product (attention x CLIPSeg), upsampled for display
    hadamard_vis = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)
    im3 = axes[3].imshow(hadamard_vis, cmap='jet', interpolation='bilinear')
    axes[3].set_title("Hadamard\n(Contrastive x CLIP)")
    axes[3].axis('off')
    plt.colorbar(im3, ax=axes[3], fraction=0.046, pad=0.04)

    # (5) Final blurred prediction (the map that is scored)
    im4 = axes[4].imshow(avg_norm_cliped_blur, cmap='jet', interpolation='bilinear')
    axes[4].set_title("Final Blurred")
    axes[4].axis('off')
    plt.colorbar(im4, ax=axes[4], fraction=0.046, pad=0.04)

    # (6) Ground truth
    axes[5].imshow(gt_map, cmap='gray')
    axes[5].set_title("Ground Truth")
    axes[5].axis('off')

    # (7) Exocentric reference image (panel left blank if it could not be read)
    if exo_img is not None:
        axes[6].imshow(exo_img)
    axes[6].set_title("Exocentric Image")
    axes[6].axis('off')

    # Save the figure; close it so figures do not pile up in memory across iterations.
    exo_stem = os.path.splitext(exo_name)[0]
    save_path = os.path.join(output_dir, f"exo_{object_name}_{action}_{file_stem}_{exo_stem}.png")
    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight', dpi=150)
    plt.close(fig)

# Collect the per-sample records into one table and persist it.
# The file name is derived from Layername so it always matches the configured run
# (for Layername = "selection_hold_axe" this is still "exo_selection_hold_axe.pkl").
df_analy = pd.DataFrame(results_list)
df_analy.to_pickle(f"exo_{Layername}.pkl")

exo_name : hold_axe_000384.jpg
[0] Processing: hold - axe


  return self.preprocess(images, **kwargs)


Selected CLIP input :  red

Metrics for all_output axe_001552.jpg:
 all_output Current - KLD: 0.2122 | SIM: 0.7477 | NSS: 2.5619

Cumulative all_output  Averages over 1 samples:
Average - KLD: 0.2122 | SIM: 0.7477 | NSS: 2.5619

exo_name : hold_axe_000856.jpg
[1] Processing: hold - axe
Selected CLIP input :  red

Metrics for all_output axe_001552.jpg:
 all_output Current - KLD: 0.2113 | SIM: 0.7490 | NSS: 2.5699

Cumulative all_output  Averages over 2 samples:
Average - KLD: 0.2118 | SIM: 0.7484 | NSS: 2.5659

exo_name : hold_axe_001123.jpg
[2] Processing: hold - axe
Selected CLIP input :  red

Metrics for all_output axe_001552.jpg:
 all_output Current - KLD: 0.2118 | SIM: 0.7482 | NSS: 2.5682

Cumulative all_output  Averages over 3 samples:
Average - KLD: 0.2118 | SIM: 0.7483 | NSS: 2.5667

exo_name : hold_axe_003505.jpg
[3] Processing: hold - axe
Selected CLIP input :  red

Metrics for all_output axe_001552.jpg:
 all_output Current - KLD: 0.2136 | SIM: 0.7460 | NSS: 2.5551

Cumulativ

In [5]:
# This cell carries an older execution count than the analysis cell above. With a
# hard-coded "badminton_racket" file name, running the notebook top to bottom
# would store the current run's results under the wrong name, so the name is
# derived from Layername instead.
df_analy.to_pickle(f"exo_{Layername}.pkl")

In [8]:
# Show the recorded samples ordered by NSS, lowest first.
df_analy.sort_values(by="NSS")

Unnamed: 0,object,action,filename,output_sentence,top_token_text,following_text,clip_input,KLD,SIM,NSS
11,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",black,and,black and,0.671341,0.543523,1.9507
10,axe,hold,axe_001552.jpg,"Based on the first image, the part of the axe ...",red,and,red and,0.223632,0.739141,2.538331
15,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.215088,0.744543,2.551214
5,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.214253,0.745661,2.554565
6,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.212254,0.746921,2.555052
3,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.213585,0.746024,2.555101
13,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.21325,0.745724,2.555855
0,axe,hold,axe_001552.jpg,"Based on the first image, the handle (specific...",red,and,red and,0.212207,0.747732,2.561917
14,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.207569,0.750961,2.566384
7,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.20852,0.750213,2.566911


In [None]:
# Show the recorded samples ordered by NSS, lowest first.
df_analy.sort_values(by="NSS")

Unnamed: 0,object,action,filename,output_sentence,top_token_text,following_text,clip_input,KLD,SIM,NSS
8,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.833763,0.45651,2.224909
12,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.82919,0.458288,2.231138
2,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.830226,0.458051,2.231992
3,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.826313,0.459504,2.232696
1,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.823134,0.460721,2.232715
9,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.829356,0.458138,2.233707
0,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.828423,0.458777,2.233982
18,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.827386,0.458944,2.235275
4,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.827909,0.4585,2.235811
7,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.823897,0.460539,2.236242
