In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os

from scipy.stats import pearsonr
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

# CLIPSeg processor/model pair, both loaded from the same checkpoint id.
CLIPSEG_CHECKPOINT = "CIDAS/clipseg-rd64-refined"
clip_processor = CLIPSegProcessor.from_pretrained(CLIPSEG_CHECKPOINT)
clip_model = CLIPSegForImageSegmentation.from_pretrained(CLIPSEG_CHECKPOINT)

import os
import sys
# Make the parent of the working directory importable so the project-local
# modules (config, file_managing, ...) can be imported below.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guarded so that re-running this cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# NOTE: `model_name` imported here is reassigned further down in this cell.
from config import AGD20K_PATH, model_name

from VLM_model_dot_relative import MetricsTracker
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
    load_ground_truth,
    prompt_dict_obj,
    get_clipseg_heatmap,
    calculate_metrics,
    prompt_dict_obj
)

def min_max_normalize(arr):
    """Rescale ``arr`` linearly so its minimum maps to 0 and its maximum to ~1.

    Args:
        arr: array exposing ``.min()`` / ``.max()`` (e.g. a NumPy array).

    Returns:
        An all-zero array shaped like ``arr`` when every element is equal;
        otherwise ``(arr - min) / (max - min + 1e-8)``.
    """
    lowest = arr.min()
    value_range = arr.max() - lowest
    # A constant map carries no contrast; report it as all zeros.
    if value_range == 0:
        return np.zeros_like(arr)
    shifted = arr - lowest
    return shifted / (value_range + 1e-8)



from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# NOTE(review): this rebinds `model_name`, replacing the value imported from
# `config` earlier in this cell -- confirm the override is intentional.
model_name = "Qwen/Qwen3-VL-32B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
tok = processor.tokenizer

# Bare last expression: echoes the dataset root as the cell's output.
AGD20K_PATH

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


'/home/DATA/AGD20K'

In [2]:
import numpy as np

def check_heatmap_containment(heatmap_top, heatmap_obj, threshold=0.15, containment_ratio=0.8):
    """
    Decide whether the active region of ``heatmap_top`` is a smaller region
    lying (mostly) inside the active region of ``heatmap_obj``.

    Both heatmaps are binarised with ``> threshold``. The check passes when
    the top mask has strictly fewer active pixels than the object mask AND at
    least ``containment_ratio`` of the top mask's pixels overlap the object mask.

    Args:
        heatmap_top: array-like heatmap of the candidate part. Objects exposing
            ``.cpu()`` (e.g. torch tensors) are detached and converted to NumPy.
        heatmap_obj: array-like heatmap of the whole object, same handling.
            The two maps are compared element-wise.
        threshold (float): activation cut-off used to build the binary masks.
        containment_ratio (float): minimum fraction of the top mask that must
            overlap the object mask (default 0.8 = 80%).

    Returns:
        bool: True when the top region is smaller than and contained in the
        object region; False otherwise, including when the top mask is empty.
    """

    # 1. Convert tensors to NumPy arrays.
    if hasattr(heatmap_top, 'cpu'):
        heatmap_top = heatmap_top.detach().cpu().numpy()
    if hasattr(heatmap_obj, 'cpu'):
        heatmap_obj = heatmap_obj.detach().cpu().numpy()

    # 2. Build the binary masks.
    mask_top = heatmap_top > threshold
    mask_obj = heatmap_obj > threshold

    # 3. Active areas, in pixels.
    area_top = np.sum(mask_top)
    area_obj = np.sum(mask_obj)

    # An empty top mask cannot be "contained" in anything.
    if area_top == 0:
        return False

    # Condition 1: the top region must be strictly smaller than the object region.
    is_smaller = area_top < area_obj

    # Condition 2: enough of the top region overlaps the object region.
    # Equivalent to (intersection_area / area_top) >= containment_ratio.
    intersection_area = np.sum(np.logical_and(mask_top, mask_obj))
    is_inside = intersection_area >= (area_top * containment_ratio)

    # Cast so callers always receive a plain Python bool (not numpy.bool_).
    return bool(is_smaller and is_inside)

In [11]:
metrics_tracker_alloutput = MetricsTracker(name="all_output")
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
Layername = "clipseg_top1"
output_dir = f"./output_{Layername}"  # one output directory per experiment name
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): machine-specific absolute path; consider moving it to `config`.
ATTENTION_RESULT_DIR = "/home/bongo/porter_notebook/research/qwen3/AttentionHeads/output_results"
NUM_RESULT_SHARDS = 24       # shard files are numbered 1..NUM_RESULT_SHARDS
ATTN_GRID = (31, 31)         # grid size of the stored per-head attention heatmaps
CLIP_MASK_THRESHOLD = 0.15   # cut-off used to binarise the object CLIPSeg map
NUM_TOP_TOKENS = 1           # how many highest-scoring tokens form the signal map
gamma = 0.75                 # exponent applied to the attention x CLIPSeg product
BLUR_SIGMA_RATIO = 0.05      # Gaussian sigma as a fraction of min(width, height)

POS_ALPHA = 0  # not referenced anywhere in this cell
results_list = []
pos_map = np.zeros(ATTN_GRID, dtype=np.float32)

for i in range(1, NUM_RESULT_SHARDS + 1):
    pkl_path = f"{ATTENTION_RESULT_DIR}/attention_result_full_output_32B_{i}.pkl"
    if not os.path.exists(pkl_path):
        continue

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df_output = pd.read_pickle(pkl_path)

    for idx, row in df_output.iterrows():
        # Sum over every head of every token; not consumed later in this cell.
        sum_heatmap = np.zeros(ATTN_GRID, dtype=np.float32)

        object_name = row['object']
        action = row['action']
        filename = row['filename']
        output_description = row['output_sentence']
        output_attentions = row['output_attentions']
        PLSP_name = prompt_dict_obj[action][object_name]

        # splitext keeps everything before the final extension, so file names
        # that contain extra dots are not truncated.
        file_stem = os.path.splitext(filename)[0]
        file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
        gt_path = f"{AGD20K_PATH}/Seen/testset/GT/{action}/{object_name}/{file_stem}.png"

        # Load the image.
        if not os.path.exists(file_name_real):
            print(f"Image not found: {file_name_real}")
            continue

        orig_img = cv2.imread(file_name_real)
        if orig_img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Image could not be decoded: {file_name_real}")
            continue
        orig_img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
        h, w = orig_img.shape[:2]

        print(f"[{idx}] Processing: {action} - {object_name}")

        # --- 1. CLIPSeg map for the whole object ---
        clip_object_heatmap = get_clipseg_heatmap(
            file_name_real,
            clip_model,
            clip_processor,
            object_name,
        )

        # Bring the CLIPSeg result down to the attention grid and binarise it.
        clip_heatmap_resized = cv2.resize(clip_object_heatmap, ATTN_GRID, interpolation=cv2.INTER_LINEAR)
        clip_binary_mask = (clip_heatmap_resized > CLIP_MASK_THRESHOLD).astype(np.float32)

        # --- 2. Per-token attention maps and object-masked scores ---
        token_scores = []
        token_idx = 0
        for token in output_attentions:
            token_heatmap = np.zeros(ATTN_GRID, dtype=np.float32)
            token_head_count = 0

            attention_value = token['attentions']
            decoded_str = token['token_str']

            for each_attention in attention_value:
                sum_heatmap += each_attention['heatmap']
                token_heatmap += each_attention['heatmap']
                token_head_count += 1

            # Tokens without any recorded head contribute nothing; skip them.
            if token_head_count == 0:
                continue

            # Total attention mass, and the part that falls on the object mask.
            s_img = token_heatmap.sum()
            s_img_masked = (token_heatmap * clip_binary_mask).sum()

            # token_idx counts kept tokens only, so it equals the entry's
            # position in token_scores.
            token_scores.append({
                "token": decoded_str,
                "token_idx": token_idx,
                "score": s_img_masked,
                "score_ori": s_img,
                "heatmap": token_heatmap,
                "count": token_head_count
            })
            token_idx += 1

        if len(token_scores) == 0:
            print("No valid tokens found.")
            continue

        # --- 3. Pick the highest-scoring token and build the CLIPSeg phrase ---
        # Ascending by masked score: lowest first, highest last.
        sorted_tokens = sorted(token_scores, key=lambda x: x['score'])

        # Take up to NUM_TOP_TOKENS from each end, but never fewer than one.
        num_select = max(1, min(NUM_TOP_TOKENS, len(sorted_tokens) // 2))

        bottom_tokens = sorted_tokens[:num_select]       # lowest-scoring tokens (not used below)
        top_tokens = sorted_tokens[-num_select:][::-1]   # highest-scoring tokens, best first

        # Best token; with NUM_TOP_TOKENS == 1 this is the only selected token.
        top_token_idx = top_tokens[0]['token_idx']
        top_token_text = top_tokens[0]['token']

        # The token right after the top one extends the phrase. It does not
        # exist when the top token is the last scored token, so fall back to
        # the top token alone instead of failing on a None lookup.
        following_token_idx = top_token_idx + 1
        if following_token_idx < len(token_scores):
            following_token = token_scores[following_token_idx]
            following_text = following_token['token']
            top_phrase = top_token_text + ' ' + following_text
        else:
            following_token = None
            following_text = ''
            top_phrase = top_token_text

        clip_top_heatmap = get_clipseg_heatmap(
            file_name_real,
            clip_model,
            clip_processor,
            top_phrase,
            # PLSP_name
        )

        # Mean attention map of the selected top tokens.
        pos_map = np.sum([t['heatmap'] for t in top_tokens], axis=0)
        pos_map /= len(top_tokens)

        # Scale by the peak, then min-max normalise to [0, 1).
        if pos_map.max() > 0:
            pos_map /= pos_map.max()
        h_min, h_max = pos_map.min(), pos_map.max()
        avg_norm = (pos_map - h_min) / (h_max - h_min + 1e-8)

        # Use the phrase-level CLIPSeg map only when it is a smaller region
        # contained in the object map; otherwise keep the object-level map.
        if check_heatmap_containment(clip_top_heatmap, clip_object_heatmap):
            clip_heatmap = clip_top_heatmap
            clipseg_input_text = top_phrase
            print(f"Selected CLIP input : {clipseg_input_text}")
        else:
            clip_heatmap = clip_object_heatmap
            clipseg_input_text = object_name

        clip_heatmap_resized = cv2.resize(clip_heatmap, ATTN_GRID, interpolation=cv2.INTER_LINEAR)
        avg_norm_cliped = avg_norm * clip_heatmap_resized
        # A fractional power of a negative value is NaN; clamp at zero first.
        # No effect when the CLIPSeg map is already non-negative.
        avg_norm_cliped = np.power(np.clip(avg_norm_cliped, 0.0, None), gamma)

        # Upsample to the image size and smooth.
        avg_norm_cliped_rescaled = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)

        sig = min(w, h) * BLUR_SIGMA_RATIO
        k_val = int(sig * 3) * 2 + 1  # odd kernel width covering about +/- 3 sigma
        kernel_size = (k_val, k_val)

        blur_map = cv2.GaussianBlur(avg_norm_cliped_rescaled, kernel_size, sig)

        # Re-normalise after blurring.
        blur_map = min_max_normalize(blur_map)
        avg_norm_cliped_blur = blur_map

        # Image-sized copies of the grid maps, for visualisation only.
        avg_norm_resized_vis = cv2.resize(avg_norm, (w, h), interpolation=cv2.INTER_LINEAR)
        clip_vis = cv2.resize(clip_heatmap_resized, (w, h), interpolation=cv2.INTER_NEAREST)

        # --- 4. Ground-truth evaluation ---
        gt_map = load_ground_truth(gt_path)
        if gt_map is None:
            print("NO GT!!!")
            continue

        metrics_dino = calculate_metrics(avg_norm_cliped_blur, gt_map)
        metrics_tracker_alloutput.update(metrics_dino)

        metrics_text = f"[{object_name} {action}] KLD: {metrics_dino['KLD']:.4f} | SIM: {metrics_dino['SIM']:.4f} | NSS: {metrics_dino['NSS']:.4f}"
        metrics_tracker_alloutput.print_metrics(metrics_dino, filename)

        results_list.append({
            'object': object_name,
            'action': action,
            'filename': filename,
            'output_sentence': output_description,
            'top_token_text': top_token_text,
            'following_text': following_text,
            'clip_input': clipseg_input_text,
            'KLD': metrics_dino['KLD'],
            'SIM': metrics_dino['SIM'],
            'NSS': metrics_dino['NSS']
        })

        # --- 5. Visualisation ---
        fig, axes = plt.subplots(1, 6, figsize=(24, 5))

        # Selected top tokens, shown in the figure title.
        top_words = ",".join([f"'{t['token'].strip()}'" for t in top_tokens[:5]])

        main_title = f"Obj: {object_name} | Act: {action} |{metrics_text}\nTop Tokens: [{top_words}({top_token_idx } ), clipseg input : {top_token_text} {following_text}] \n Whole answer : {output_description}"
        fig.suptitle(main_title, fontsize=14, fontweight='bold', y=0.98)

        # (1) Original image
        axes[0].imshow(orig_img)
        axes[0].set_title(f"Original\n({object_name})")
        axes[0].axis('off')

        # (2) Normalised attention map of the top token(s)
        im1 = axes[1].imshow(avg_norm_resized_vis, cmap='jet', interpolation='bilinear')
        axes[1].set_title(f"Attention Map {Layername}")
        axes[1].axis('off')
        plt.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)

        # (3) CLIPSeg result actually used
        axes[2].imshow(clip_vis, cmap='gray')
        axes[2].set_title(f"CLIPSeg {clipseg_input_text}")
        axes[2].axis('off')

        # (4) Element-wise product (attention x CLIPSeg), upsampled for display
        hadamard_vis = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)
        im3 = axes[3].imshow(hadamard_vis, cmap='jet', interpolation='bilinear')
        axes[3].set_title("Hadamard\n(Contrastive x CLIP)")
        axes[3].axis('off')
        plt.colorbar(im3, ax=axes[3], fraction=0.046, pad=0.04)

        # (5) Final blurred prediction (the map that was scored)
        im4 = axes[4].imshow(avg_norm_cliped_blur, cmap='jet', interpolation='bilinear')
        axes[4].set_title("Final Blurred")
        axes[4].axis('off')
        plt.colorbar(im4, ax=axes[4], fraction=0.046, pad=0.04)

        # (6) Ground truth
        axes[5].imshow(gt_map, cmap='gray')
        axes[5].set_title("Ground Truth")
        axes[5].axis('off')

        # Save the figure and release it to keep memory bounded across the loop.
        save_path = os.path.join(output_dir, f"{object_name}_{action}_{file_stem}.png")
        plt.tight_layout()
        plt.savefig(save_path, bbox_inches='tight', dpi=150)
        plt.close(fig)

df_analy = pd.DataFrame(results_list)


[0] Processing: cut - apple


  return self.preprocess(images, **kwargs)



Metrics for all_output apple_000054.jpg:
 all_output Current - KLD: 0.8334 | SIM: 0.6088 | NSS: 0.6762

Cumulative all_output  Averages over 1 samples:
Average - KLD: 0.8334 | SIM: 0.6088 | NSS: 0.6762

[1] Processing: eat - apple
Selected CLIP input :  flesh  of

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.2388 | SIM: 0.7479 | NSS: 1.1965

Cumulative all_output  Averages over 2 samples:
Average - KLD: 0.5361 | SIM: 0.6784 | NSS: 0.9364

[2] Processing: peel - apple
Selected CLIP input :  colored  layer

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.1669 | SIM: 0.7830 | NSS: 1.1020

Cumulative all_output  Averages over 3 samples:
Average - KLD: 0.4131 | SIM: 0.7133 | NSS: 0.9916

[3] Processing: hit - axe
Selected CLIP input :  blade  (

Metrics for all_output axe_000961.jpg:
 all_output Current - KLD: 0.9041 | SIM: 0.4284 | NSS: 1.1537

Cumulative all_output  Averages over 4 samples:
Average - KLD: 0.5358 | SIM: 0.6420 | NSS: 1.0321



  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle  (

Metrics for all_output badminton_racket_002255.jpg:
 all_output Current - KLD: 0.8766 | SIM: 0.4337 | NSS: 2.0747

Cumulative all_output  Averages over 6 samples:
Average - KLD: 0.5670 | SIM: 0.6093 | NSS: 1.4580

[6] Processing: swing - badminton_racket
Selected CLIP input : inton  racket

Metrics for all_output badminton_racket_003649.jpg:
 all_output Current - KLD: 5.0593 | SIM: 0.0110 | NSS: -0.4230

Cumulative all_output  Averages over 7 samples:
Average - KLD: 1.2088 | SIM: 0.5239 | NSS: 1.1892

[7] Processing: cut - banana

Metrics for all_output banana_002623.jpg:
 all_output Current - KLD: 0.1876 | SIM: 0.7674 | NSS: 1.4367

Cumulative all_output  Averages over 8 samples:
Average - KLD: 1.0811 | SIM: 0.5543 | NSS: 1.2202

[8] Processing: eat - banana

Metrics for all_output banana_002458.jpg:
 all_output Current - KLD: 0.1092 | SIM: 0.8322 | NSS: 1.7512

Cumulative all_output  Averages over 9 samples:
Average - KLD: 0.9731 | SIM: 0.5852 | NSS:

  return self.preprocess(images, **kwargs)



Metrics for all_output baseball_002670.jpg:
 all_output Current - KLD: 0.2493 | SIM: 0.7717 | NSS: 3.3594

Cumulative all_output  Averages over 11 samples:
Average - KLD: 0.9395 | SIM: 0.5785 | NSS: 1.4425

[11] Processing: hit - baseball_bat
Selected CLIP input :  is  the

Metrics for all_output baseball_bat_001882.jpg:
 all_output Current - KLD: 1.7571 | SIM: 0.2016 | NSS: 0.9191

Cumulative all_output  Averages over 12 samples:
Average - KLD: 1.0077 | SIM: 0.5471 | NSS: 1.3989

[12] Processing: hold - baseball_bat

Metrics for all_output baseball_bat_002547.jpg:
 all_output Current - KLD: 2.1838 | SIM: 0.1434 | NSS: 1.4824

Cumulative all_output  Averages over 13 samples:
Average - KLD: 1.0981 | SIM: 0.5160 | NSS: 1.4053

[13] Processing: swing - baseball_bat

Metrics for all_output baseball_bat_001882.jpg:
 all_output Current - KLD: 2.0169 | SIM: 0.1656 | NSS: 0.4258

Cumulative all_output  Averages over 14 samples:
Average - KLD: 1.1638 | SIM: 0.4910 | NSS: 1.3354

[14] Processin

  return self.preprocess(images, **kwargs)


Selected CLIP input :  mattress  is

Metrics for all_output bed_002880.jpg:
 all_output Current - KLD: 0.4077 | SIM: 0.6428 | NSS: 2.8591

Cumulative all_output  Averages over 16 samples:
Average - KLD: 1.0750 | SIM: 0.5076 | NSS: 1.5665

[16] Processing: sit_on - bed
Selected CLIP input :  the  mattress

Metrics for all_output bed_003622.jpg:
 all_output Current - KLD: 0.4249 | SIM: 0.6231 | NSS: 1.7428

Cumulative all_output  Averages over 17 samples:
Average - KLD: 1.0368 | SIM: 0.5144 | NSS: 1.5769

[17] Processing: lie_on - bench

Metrics for all_output bench_003727.jpg:
 all_output Current - KLD: 0.7333 | SIM: 0.5000 | NSS: 1.2494

Cumulative all_output  Averages over 18 samples:
Average - KLD: 1.0199 | SIM: 0.5136 | NSS: 1.5587

[18] Processing: sit_on - bench

Metrics for all_output bench_001877.jpg:
 all_output Current - KLD: 0.0721 | SIM: 0.8719 | NSS: 1.8440

Cumulative all_output  Averages over 19 samples:
Average - KLD: 0.9700 | SIM: 0.5324 | NSS: 1.5737

[19] Processing: 

  return self.preprocess(images, **kwargs)


Selected CLIP input :  seat  of

Metrics for all_output bicycle_003046.jpg:
 all_output Current - KLD: 0.9712 | SIM: 0.4250 | NSS: 1.6790

Cumulative all_output  Averages over 21 samples:
Average - KLD: 1.0502 | SIM: 0.5072 | NSS: 1.5056

[21] Processing: sit_on - bicycle
Selected CLIP input :  seat  (

Metrics for all_output bicycle_002100.jpg:
 all_output Current - KLD: 2.0466 | SIM: 0.1698 | NSS: 1.2674

Cumulative all_output  Averages over 22 samples:
Average - KLD: 1.0955 | SIM: 0.4919 | NSS: 1.4948

[22] Processing: look_out - binoculars
Selected CLIP input :  smaller  lenses

Metrics for all_output binoculars_003630.jpg:
 all_output Current - KLD: 1.9722 | SIM: 0.2092 | NSS: -0.1403

Cumulative all_output  Averages over 23 samples:
Average - KLD: 1.1336 | SIM: 0.4796 | NSS: 1.4237

[23] Processing: hold - book

Metrics for all_output book_001195.jpg:
 all_output Current - KLD: 0.7542 | SIM: 0.5019 | NSS: 1.2021

Cumulative all_output  Averages over 24 samples:
Average - KLD: 1.1

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the  bottle

Metrics for all_output bottle_003259.jpg:
 all_output Current - KLD: 1.5836 | SIM: 0.2269 | NSS: 1.9349

Cumulative all_output  Averages over 26 samples:
Average - KLD: 1.1183 | SIM: 0.4719 | NSS: 1.4759

[26] Processing: hold - bottle

Metrics for all_output bottle_001227.jpg:
 all_output Current - KLD: 0.3552 | SIM: 0.6784 | NSS: 1.9411

Cumulative all_output  Averages over 27 samples:
Average - KLD: 1.0900 | SIM: 0.4796 | NSS: 1.4932

[27] Processing: open - bottle
Selected CLIP input :  cap  or

Metrics for all_output bottle_001033.jpg:
 all_output Current - KLD: 1.9420 | SIM: 0.1661 | NSS: 1.2531

Cumulative all_output  Averages over 28 samples:
Average - KLD: 1.1204 | SIM: 0.4684 | NSS: 1.4846

[28] Processing: pour - bottle
Selected CLIP input :  narrow  sp

Metrics for all_output bottle_002780.jpg:
 all_output Current - KLD: 1.5520 | SIM: 0.2752 | NSS: 0.4279

Cumulative all_output  Averages over 29 samples:
Average - KLD: 1.1353 | SIM: 0.461

  return self.preprocess(images, **kwargs)



Metrics for all_output bowl_000134.jpg:
 all_output Current - KLD: 1.1660 | SIM: 0.4030 | NSS: 0.5601

Cumulative all_output  Averages over 31 samples:
Average - KLD: 1.1038 | SIM: 0.4706 | NSS: 1.4016

[31] Processing: wash - bowl

Metrics for all_output bowl_002825.jpg:
 all_output Current - KLD: 0.8018 | SIM: 0.5046 | NSS: 0.4879

Cumulative all_output  Averages over 32 samples:
Average - KLD: 1.0944 | SIM: 0.4717 | NSS: 1.3730

[32] Processing: eat - broccoli

Metrics for all_output broccoli_002796.jpg:
 all_output Current - KLD: 0.1765 | SIM: 0.7720 | NSS: 1.6093

Cumulative all_output  Averages over 33 samples:
Average - KLD: 1.0666 | SIM: 0.4808 | NSS: 1.3802

[33] Processing: take_photo - camera
Selected CLIP input :  the  camera

Metrics for all_output camera_002534.jpg:
 all_output Current - KLD: 0.6938 | SIM: 0.5367 | NSS: 0.4049

Cumulative all_output  Averages over 34 samples:
Average - KLD: 1.0556 | SIM: 0.4824 | NSS: 1.3515

[34] Processing: cut - carrot

Metrics for al

  return self.preprocess(images, **kwargs)



Metrics for all_output carrot_001443.jpg:
 all_output Current - KLD: 0.7252 | SIM: 0.5100 | NSS: 3.0505

Cumulative all_output  Averages over 36 samples:
Average - KLD: 1.0401 | SIM: 0.4830 | NSS: 1.4369

[36] Processing: peel - carrot

Metrics for all_output carrot_003707.jpg:
 all_output Current - KLD: 0.2542 | SIM: 0.7270 | NSS: 2.1059

Cumulative all_output  Averages over 37 samples:
Average - KLD: 1.0188 | SIM: 0.4896 | NSS: 1.4549

[37] Processing: take_photo - cell_phone
Selected CLIP input :  camera  lens

Metrics for all_output cell_phone_000601.jpg:
 all_output Current - KLD: 0.4549 | SIM: 0.6261 | NSS: 1.5143

Cumulative all_output  Averages over 38 samples:
Average - KLD: 1.0040 | SIM: 0.4932 | NSS: 1.4565

[38] Processing: talk_on - cell_phone

Metrics for all_output cell_phone_000601.jpg:
 all_output Current - KLD: 0.6625 | SIM: 0.5256 | NSS: 1.4945

Cumulative all_output  Averages over 39 samples:
Average - KLD: 0.9952 | SIM: 0.4940 | NSS: 1.4575

[39] Processing: text_

  return self.preprocess(images, **kwargs)


Selected CLIP input :  cushion  of

Metrics for all_output chair_002839.jpg:
 all_output Current - KLD: 1.0170 | SIM: 0.3974 | NSS: 0.8004

Cumulative all_output  Averages over 41 samples:
Average - KLD: 0.9909 | SIM: 0.4911 | NSS: 1.4448

[41] Processing: lie_on - couch
Selected CLIP input :  seat  and

Metrics for all_output couch_003293.jpg:
 all_output Current - KLD: 0.7022 | SIM: 0.5044 | NSS: 1.5780

Cumulative all_output  Averages over 42 samples:
Average - KLD: 0.9840 | SIM: 0.4914 | NSS: 1.4479

[42] Processing: sit_on - couch
Selected CLIP input :  seat  cushions

Metrics for all_output couch_000779.jpg:
 all_output Current - KLD: 0.7742 | SIM: 0.4744 | NSS: 1.9378

Cumulative all_output  Averages over 43 samples:
Average - KLD: 0.9791 | SIM: 0.4910 | NSS: 1.4593

[43] Processing: drink_with - cup

Metrics for all_output cup_000508.jpg:
 all_output Current - KLD: 0.5549 | SIM: 0.5606 | NSS: 2.7144

Cumulative all_output  Averages over 44 samples:
Average - KLD: 0.9695 | SIM: 

  return self.preprocess(images, **kwargs)



Metrics for all_output cup_001535.jpg:
 all_output Current - KLD: 1.2661 | SIM: 0.3585 | NSS: 1.9958

Cumulative all_output  Averages over 46 samples:
Average - KLD: 0.9698 | SIM: 0.4909 | NSS: 1.5231

[46] Processing: sip - cup

Metrics for all_output cup_001864.jpg:
 all_output Current - KLD: 0.4944 | SIM: 0.6032 | NSS: 0.9434

Cumulative all_output  Averages over 47 samples:
Average - KLD: 0.9597 | SIM: 0.4933 | NSS: 1.5108

[47] Processing: wash - cup

Metrics for all_output cup_003621.jpg:
 all_output Current - KLD: 1.0558 | SIM: 0.3659 | NSS: 1.8641

Cumulative all_output  Averages over 48 samples:
Average - KLD: 0.9617 | SIM: 0.4907 | NSS: 1.5181

[48] Processing: throw - discus
Selected CLIP input :  metal  hub

Metrics for all_output discus_003558.jpg:
 all_output Current - KLD: 0.5092 | SIM: 0.5983 | NSS: 0.6829

Cumulative all_output  Averages over 49 samples:
Average - KLD: 0.9524 | SIM: 0.4929 | NSS: 1.5011

[49] Processing: beat - drum

Metrics for all_output drum_002586

  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle  of

Metrics for all_output fork_000804.jpg:
 all_output Current - KLD: 0.6820 | SIM: 0.5235 | NSS: 1.9527

Cumulative all_output  Averages over 51 samples:
Average - KLD: 0.9471 | SIM: 0.4921 | NSS: 1.4850

[51] Processing: lift - fork

Metrics for all_output fork_001691.jpg:
 all_output Current - KLD: 1.5281 | SIM: 0.2634 | NSS: 0.3242

Cumulative all_output  Averages over 52 samples:
Average - KLD: 0.9583 | SIM: 0.4877 | NSS: 1.4627

[52] Processing: stick - fork

Metrics for all_output fork_000095.jpg:
 all_output Current - KLD: 0.7510 | SIM: 0.4895 | NSS: 1.9462

Cumulative all_output  Averages over 53 samples:
Average - KLD: 0.9544 | SIM: 0.4878 | NSS: 1.4718

[53] Processing: wash - fork

Metrics for all_output fork_001691.jpg:
 all_output Current - KLD: 0.8478 | SIM: 0.4628 | NSS: 1.9234

Cumulative all_output  Averages over 54 samples:
Average - KLD: 0.9524 | SIM: 0.4873 | NSS: 1.4801

[54] Processing: catch - frisbee
Selected CLIP input : edges  

  return self.preprocess(images, **kwargs)


Selected CLIP input :  a  fr

Metrics for all_output frisbee_001130.jpg:
 all_output Current - KLD: 0.9350 | SIM: 0.4188 | NSS: -0.2984

Cumulative all_output  Averages over 56 samples:
Average - KLD: 0.9479 | SIM: 0.4864 | NSS: 1.4228

[56] Processing: throw - frisbee

Metrics for all_output frisbee_003249.jpg:
 all_output Current - KLD: 0.6222 | SIM: 0.5485 | NSS: 0.4913

Cumulative all_output  Averages over 57 samples:
Average - KLD: 0.9422 | SIM: 0.4875 | NSS: 1.4065

[57] Processing: hold - golf_clubs
Selected CLIP input :  black ,

Metrics for all_output golf_clubs_000045.jpg:
 all_output Current - KLD: 0.8900 | SIM: 0.4729 | NSS: 1.6259

Cumulative all_output  Averages over 58 samples:
Average - KLD: 0.9413 | SIM: 0.4872 | NSS: 1.4103

[58] Processing: swing - golf_clubs

Metrics for all_output golf_clubs_001992.jpg:
 all_output Current - KLD: 2.3558 | SIM: 0.1373 | NSS: 0.4142

Cumulative all_output  Averages over 59 samples:
Average - KLD: 0.9652 | SIM: 0.4813 | NSS: 1.3934

[

  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle  (

Metrics for all_output hammer_000215.jpg:
 all_output Current - KLD: 1.0827 | SIM: 0.3544 | NSS: 2.3231

Cumulative all_output  Averages over 61 samples:
Average - KLD: 0.9805 | SIM: 0.4747 | NSS: 1.3911

[61] Processing: eat - hot_dog

Metrics for all_output hot_dog_002166.jpg:
 all_output Current - KLD: 0.3096 | SIM: 0.6895 | NSS: 1.0854

Cumulative all_output  Averages over 62 samples:
Average - KLD: 0.9696 | SIM: 0.4782 | NSS: 1.3862

[62] Processing: throw - javelin
Selected CLIP input :  yellow  or

Metrics for all_output javelin_001474.jpg:
 all_output Current - KLD: 0.3226 | SIM: 0.7294 | NSS: 3.6087

Cumulative all_output  Averages over 63 samples:
Average - KLD: 0.9594 | SIM: 0.4822 | NSS: 1.4215

[63] Processing: type_on - keyboard

Metrics for all_output keyboard_000439.jpg:
 all_output Current - KLD: 0.2410 | SIM: 0.7356 | NSS: 1.3397

Cumulative all_output  Averages over 64 samples:
Average - KLD: 0.9482 | SIM: 0.4861 | NSS: 1.4202

[64] 

  return self.preprocess(images, **kwargs)



Metrics for all_output knife_002682.jpg:
 all_output Current - KLD: 0.5273 | SIM: 0.6288 | NSS: 2.9902

Cumulative all_output  Averages over 66 samples:
Average - KLD: 0.9348 | SIM: 0.4901 | NSS: 1.4567

[66] Processing: stick - knife

Metrics for all_output knife_001072.jpg:
 all_output Current - KLD: 1.5174 | SIM: 0.2483 | NSS: 1.9480

Cumulative all_output  Averages over 67 samples:
Average - KLD: 0.9435 | SIM: 0.4865 | NSS: 1.4640

[67] Processing: wash - knife

Metrics for all_output knife_002720.jpg:
 all_output Current - KLD: 0.3930 | SIM: 0.6873 | NSS: 2.9624

Cumulative all_output  Averages over 68 samples:
Average - KLD: 0.9354 | SIM: 0.4895 | NSS: 1.4860

[68] Processing: type_on - laptop
Selected CLIP input :  keyboard  is

Metrics for all_output laptop_000585.jpg:
 all_output Current - KLD: 0.5253 | SIM: 0.5943 | NSS: 1.9482

Cumulative all_output  Averages over 69 samples:
Average - KLD: 0.9295 | SIM: 0.4910 | NSS: 1.4927

[69] Processing: open - microwave
Selected CLIP 

  return self.preprocess(images, **kwargs)


Selected CLIP input :  rear  wheel

Metrics for all_output motorcycle_003541.jpg:
 all_output Current - KLD: 2.3869 | SIM: 0.1484 | NSS: 0.0825

Cumulative all_output  Averages over 71 samples:
Average - KLD: 0.9525 | SIM: 0.4846 | NSS: 1.4666

[71] Processing: ride - motorcycle
Selected CLIP input :  seat  of

Metrics for all_output motorcycle_002198.jpg:
 all_output Current - KLD: 1.1581 | SIM: 0.3682 | NSS: 1.4468

Cumulative all_output  Averages over 72 samples:
Average - KLD: 0.9554 | SIM: 0.4830 | NSS: 1.4663

[72] Processing: sit_on - motorcycle
Selected CLIP input :  the  seat

Metrics for all_output motorcycle_000837.jpg:
 all_output Current - KLD: 1.1213 | SIM: 0.3463 | NSS: 3.0604

Cumulative all_output  Averages over 73 samples:
Average - KLD: 0.9577 | SIM: 0.4811 | NSS: 1.4882

[73] Processing: cut - orange

Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.8730 | SIM: 0.5801 | NSS: 1.2030

Cumulative all_output  Averages over 74 samples:
Average - KLD

  return self.preprocess(images, **kwargs)



Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.8562 | SIM: 0.5846 | NSS: 1.2088

Cumulative all_output  Averages over 76 samples:
Average - KLD: 0.9533 | SIM: 0.4854 | NSS: 1.4784

[76] Processing: wash - orange

Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.6870 | SIM: 0.6413 | NSS: 1.2487

Cumulative all_output  Averages over 77 samples:
Average - KLD: 0.9498 | SIM: 0.4874 | NSS: 1.4754

[77] Processing: open - oven

Metrics for all_output oven_001370.jpg:
 all_output Current - KLD: 1.1927 | SIM: 0.3395 | NSS: 1.1986

Cumulative all_output  Averages over 78 samples:
Average - KLD: 0.9529 | SIM: 0.4855 | NSS: 1.4718

[78] Processing: write - pen

Metrics for all_output pen_003590.jpg:
 all_output Current - KLD: 1.3988 | SIM: 0.2838 | NSS: 1.3488

Cumulative all_output  Averages over 79 samples:
Average - KLD: 0.9586 | SIM: 0.4830 | NSS: 1.4703

[79] Processing: boxing - punching_bag
Selected CLIP input :  red  central

Metrics for all_

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the  punching

Metrics for all_output punching_bag_001639.jpg:
 all_output Current - KLD: 0.8855 | SIM: 0.4345 | NSS: 0.5756

Cumulative all_output  Averages over 81 samples:
Average - KLD: 0.9511 | SIM: 0.4842 | NSS: 1.4492

[81] Processing: open - refrigerator
Selected CLIP input :  the  **

Metrics for all_output refrigerator_002162.jpg:
 all_output Current - KLD: 1.0834 | SIM: 0.3736 | NSS: 1.1050

Cumulative all_output  Averages over 82 samples:
Average - KLD: 0.9527 | SIM: 0.4829 | NSS: 1.4450

[82] Processing: catch - rugby_ball
Selected CLIP input :  brown  panels

Metrics for all_output rugby_ball_003522.jpg:
 all_output Current - KLD: 0.3486 | SIM: 0.6960 | NSS: 0.6167

Cumulative all_output  Averages over 83 samples:
Average - KLD: 0.9454 | SIM: 0.4854 | NSS: 1.4350

[83] Processing: kick - rugby_ball
Selected CLIP input :  a  rugby

Metrics for all_output rugby_ball_002080.jpg:
 all_output Current - KLD: 0.1966 | SIM: 0.7709 | NSS: 0.9354

Cumulative 

  return self.preprocess(images, **kwargs)


Selected CLIP input :  blades  of

Metrics for all_output scissors_002479.jpg:
 all_output Current - KLD: 1.4852 | SIM: 0.2570 | NSS: 1.2752

Cumulative all_output  Averages over 86 samples:
Average - KLD: 0.9353 | SIM: 0.4888 | NSS: 1.4210

[86] Processing: hold - scissors

Metrics for all_output scissors_002479.jpg:
 all_output Current - KLD: 0.4447 | SIM: 0.6500 | NSS: 1.8065

Cumulative all_output  Averages over 87 samples:
Average - KLD: 0.9296 | SIM: 0.4907 | NSS: 1.4255

[87] Processing: carry - skateboard

Metrics for all_output skateboard_002668.jpg:
 all_output Current - KLD: 0.1699 | SIM: 0.7947 | NSS: 0.8651

Cumulative all_output  Averages over 88 samples:
Average - KLD: 0.9210 | SIM: 0.4942 | NSS: 1.4191

[88] Processing: hold - skateboard
Selected CLIP input :  the  grip

Metrics for all_output skateboard_002387.jpg:
 all_output Current - KLD: 0.5496 | SIM: 0.5855 | NSS: 1.0549

Cumulative all_output  Averages over 89 samples:
Average - KLD: 0.9168 | SIM: 0.4952 | NSS: 1

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the  grip

Metrics for all_output skateboard_001460.jpg:
 all_output Current - KLD: 0.1988 | SIM: 0.7835 | NSS: 1.1397

Cumulative all_output  Averages over 91 samples:
Average - KLD: 0.9110 | SIM: 0.4971 | NSS: 1.4087

[91] Processing: carry - skis
Selected CLIP input :  the  **

Metrics for all_output skis_002829.jpg:
 all_output Current - KLD: 2.3193 | SIM: 0.1363 | NSS: -0.1861

Cumulative all_output  Averages over 92 samples:
Average - KLD: 0.9263 | SIM: 0.4931 | NSS: 1.3914

[92] Processing: hold - skis
Selected CLIP input :  tips  of

Metrics for all_output skis_001357.jpg:
 all_output Current - KLD: 2.3867 | SIM: 0.1243 | NSS: 0.3840

Cumulative all_output  Averages over 93 samples:
Average - KLD: 0.9420 | SIM: 0.4892 | NSS: 1.3806

[93] Processing: jump - skis
Selected CLIP input :  the  entire

Metrics for all_output skis_002829.jpg:
 all_output Current - KLD: 2.4030 | SIM: 0.1249 | NSS: -0.2187

Cumulative all_output  Averages over 94 samples:
Average 

  return self.preprocess(images, **kwargs)


Selected CLIP input : rounded  tips

Metrics for all_output snowboard_001325.jpg:
 all_output Current - KLD: 1.9482 | SIM: 0.1907 | NSS: 1.1588

Cumulative all_output  Averages over 96 samples:
Average - KLD: 0.9840 | SIM: 0.4784 | NSS: 1.3500

[96] Processing: hold - snowboard
Selected CLIP input :  a  snow

Metrics for all_output snowboard_001704.jpg:
 all_output Current - KLD: 1.8541 | SIM: 0.1968 | NSS: -0.1414

Cumulative all_output  Averages over 97 samples:
Average - KLD: 0.9930 | SIM: 0.4755 | NSS: 1.3346

[97] Processing: jump - snowboard
Selected CLIP input :  a  snow

Metrics for all_output snowboard_001704.jpg:
 all_output Current - KLD: 1.2079 | SIM: 0.3606 | NSS: 0.0402

Cumulative all_output  Averages over 98 samples:
Average - KLD: 0.9952 | SIM: 0.4743 | NSS: 1.3214

[98] Processing: catch - soccer_ball

Metrics for all_output soccer_ball_003333.jpg:
 all_output Current - KLD: 0.0949 | SIM: 0.8561 | NSS: 1.3586

Cumulative all_output  Averages over 99 samples:
Average -

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the  bottom

Metrics for all_output suitcase_002998.jpg:
 all_output Current - KLD: 4.0167 | SIM: 0.0311 | NSS: -0.2673

Cumulative all_output  Averages over 101 samples:
Average - KLD: 1.0071 | SIM: 0.4778 | NSS: 1.3157

[101] Processing: hold - suitcase
Selected CLIP input : opic  handle

Metrics for all_output suitcase_003687.jpg:
 all_output Current - KLD: 1.1052 | SIM: 0.3780 | NSS: 1.7749

Cumulative all_output  Averages over 102 samples:
Average - KLD: 1.0081 | SIM: 0.4769 | NSS: 1.3202

[102] Processing: open - suitcase
Selected CLIP input :  zipper  is

Metrics for all_output suitcase_000520.jpg:
 all_output Current - KLD: 2.2725 | SIM: 0.1536 | NSS: -0.1863

Cumulative all_output  Averages over 103 samples:
Average - KLD: 1.0203 | SIM: 0.4737 | NSS: 1.3056

[103] Processing: pack - suitcase
Selected CLIP input :  leopard  print

Metrics for all_output suitcase_002212.jpg:
 all_output Current - KLD: 1.1077 | SIM: 0.3634 | NSS: 0.3752

Cumulative all_outp

  return self.preprocess(images, **kwargs)


Selected CLIP input :  black  circular

Metrics for all_output surfboard_002422.jpg:
 all_output Current - KLD: 2.9562 | SIM: 0.0820 | NSS: -0.3632

Cumulative all_output  Averages over 106 samples:
Average - KLD: 1.0362 | SIM: 0.4697 | NSS: 1.2987

[106] Processing: hold - surfboard

Metrics for all_output surfboard_002631.jpg:
 all_output Current - KLD: 0.7178 | SIM: 0.4974 | NSS: 2.6803

Cumulative all_output  Averages over 107 samples:
Average - KLD: 1.0332 | SIM: 0.4699 | NSS: 1.3116

[107] Processing: jump - surfboard

Metrics for all_output surfboard_000658.jpg:
 all_output Current - KLD: 0.8878 | SIM: 0.4480 | NSS: 0.3956

Cumulative all_output  Averages over 108 samples:
Average - KLD: 1.0319 | SIM: 0.4697 | NSS: 1.3032

[108] Processing: lie_on - surfboard

Metrics for all_output surfboard_000221.jpg:
 all_output Current - KLD: 0.1053 | SIM: 0.8404 | NSS: 2.2709

Cumulative all_output  Averages over 109 samples:
Average - KLD: 1.0234 | SIM: 0.4731 | NSS: 1.3120

[109] Process

  return self.preprocess(images, **kwargs)



Metrics for all_output tennis_racket_002268.jpg:
 all_output Current - KLD: 3.4202 | SIM: 0.0595 | NSS: -0.4407

Cumulative all_output  Averages over 111 samples:
Average - KLD: 1.0371 | SIM: 0.4722 | NSS: 1.2871

[111] Processing: hold - tennis_racket
Selected CLIP input :  handle  of

Metrics for all_output tennis_racket_001785.jpg:
 all_output Current - KLD: 0.4499 | SIM: 0.6185 | NSS: 3.3179

Cumulative all_output  Averages over 112 samples:
Average - KLD: 1.0319 | SIM: 0.4735 | NSS: 1.3052

[112] Processing: swing - tennis_racket
Selected CLIP input :  handle  of

Metrics for all_output tennis_racket_003066.jpg:
 all_output Current - KLD: 0.8117 | SIM: 0.4755 | NSS: 2.0633

Cumulative all_output  Averages over 113 samples:
Average - KLD: 1.0299 | SIM: 0.4735 | NSS: 1.3119

[113] Processing: brush_with - toothbrush
Selected CLIP input :  at  the

Metrics for all_output toothbrush_001764.jpg:
 all_output Current - KLD: 1.8930 | SIM: 0.1881 | NSS: 1.8102

Cumulative all_output  Aver

  return self.preprocess(images, **kwargs)



Metrics for all_output toothbrush_001991.jpg:
 all_output Current - KLD: 1.0471 | SIM: 0.3739 | NSS: 2.1957

Cumulative all_output  Averages over 116 samples:
Average - KLD: 1.0409 | SIM: 0.4685 | NSS: 1.3256

[116] Processing: drink_with - wine_glass
Selected CLIP input :  bowl  of

Metrics for all_output wine_glass_003343.jpg:
 all_output Current - KLD: 1.4712 | SIM: 0.2582 | NSS: 1.0559

Cumulative all_output  Averages over 117 samples:
Average - KLD: 1.0446 | SIM: 0.4667 | NSS: 1.3233

[117] Processing: hold - wine_glass
Selected CLIP input :  its  stem

Metrics for all_output wine_glass_002374.jpg:
 all_output Current - KLD: 1.1629 | SIM: 0.4283 | NSS: 1.2365

Cumulative all_output  Averages over 118 samples:
Average - KLD: 1.0456 | SIM: 0.4664 | NSS: 1.3226

[118] Processing: pour - wine_glass

Metrics for all_output wine_glass_000186.jpg:
 all_output Current - KLD: 1.2017 | SIM: 0.3336 | NSS: 1.0309

Cumulative all_output  Averages over 119 samples:
Average - KLD: 1.0469 | SIM:

In [None]:
Average - KLD: 1.0486 | SIM: 0.4630 | NSS: 1.3187  민맥스 안 하고 그냥 max로 나눈 것 (no min-max normalization; just divided by max)

In [None]:
TOP 90 : Average - KLD: 1.1244 | SIM: 0.4453 | NSS: 1.2122
TOP 95 : Average - KLD: 1.1112 | SIM: 0.4485 | NSS: 1.2430
TOP 99 : KLD: 1.0911 | SIM: 0.4547 | NSS: 1.2855
TOP 99 Bottom 50 : KLD: 1.0911 | SIM: 0.4547 | NSS: 1.2855
TOP 80 Bottom 50 :KLD: 1.1480 | SIM: 0.4391 | NSS: 1.1538
ALL_token_exp075 :KLD: 1.2934 | SIM: 0.3905 | NSS: 0.9825
TOP1 : KLD: 1.0901 | SIM: 0.4564 | NSS: 1.2929
TOP1_exp0.75 : KLD: 1.0885 | SIM: 0.4532 | NSS: 1.2754
CLIPSEG_TOP1_exp0.75 : KLD: 1.0672 | SIM: 0.4499 | NSS: 1.3048
CLIPSEG_TOP12_exp0.75 : KLD: 1.0653 | SIM: 0.4484 | NSS: 1.3006
CLIPSEG_TOP123_exp0.75 :  KLD: 1.0725 | SIM: 0.4469 | NSS: 1.2891

CLIPSEG_TOP1_GT_exp0.75 :  KLD: 0.9955 | SIM: 0.4649 | NSS: 1.3919



TOP1_real_exp0.5 : KLD: 1.1299 | SIM: 0.4168 | NSS: 1.3019
CLIP_filter top1 :  KLD: 1.0788 | SIM: 0.4702 | NSS: 1.2851
CLIP_filter top1+following :  KLD: 1.0782 | SIM: 0.4732 | NSS: 1.2873
CLIP_filter top1+following_exp0.3 : KLD: 1.0730 | SIM: 0.4661 | NSS: 1.2646
CLIP_filter top1+following_exp0.75 :  KLD: 1.0625 | SIM: 0.4734 | NSS: 1.2900
CLIP_filter top1+following_exp0.75_real :  KLD: 1.0474 | SIM: 0.4607 | NSS: 1.3293
CLIP_filter top1+following_exp0.75_real_namable :  KLD: 1.1120 | SIM: 0.4399 | NSS: 1.2498
CLIP_filter top1+following_ININ_exp0.75_real :  KLD: 1.0567 | SIM: 0.4571 | NSS: 1.3125
CLIP_filter top1+following_exp0.5_real :  KLD: 1.0970 | SIM: 0.4276 | NSS: 1.3465
CLIP_filter top1+following_exp2 :  Average - KLD: 1.5235 | SIM: 0.4005 | NSS: 0.9573

CLIP_filter PLSP_exp0.75_real :  KLD: 1.0015 | SIM: 0.4608 | NSS: 1.4040