In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os

from scipy.stats import pearsonr
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

clip_processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
clip_model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

import os
import sys
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from config import AGD20K_PATH, model_name

from VLM_model_dot_relative import MetricsTracker
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
    load_ground_truth,
    prompt_dict_obj,
    get_clipseg_heatmap,
    calculate_metrics,
    prompt_dict_obj
)

def min_max_normalize(arr):
    denom = arr.max() - arr.min()
    if denom == 0:
        return np.zeros_like(arr)
    return (arr - arr.min()) / (denom + 1e-8)



from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
model_name= f"Qwen/Qwen3-VL-32B-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
tok = processor.tokenizer

AGD20K_PATH

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


'/home/DATA/AGD20K'

In [2]:
import numpy as np

def check_heatmap_containment(heatmap_top, heatmap_obj, threshold=0.15, containment_ratio=0.8):
    """
    Args:
        containment_ratio (float): Top 영역의 몇 % 이상이 Obj와 겹쳐야 포함으로 볼 것인지 (기본 0.9 = 90%)
    """
    
    # 1. 텐서인 경우 numpy 변환
    if hasattr(heatmap_top, 'cpu'):
        heatmap_top = heatmap_top.detach().cpu().numpy()
    if hasattr(heatmap_obj, 'cpu'):
        heatmap_obj = heatmap_obj.detach().cpu().numpy()

    # 2. 이진 마스크 생성
    mask_top = heatmap_top > threshold
    mask_obj = heatmap_obj > threshold

    # 3. 면적 계산
    area_top = np.sum(mask_top)
    area_obj = np.sum(mask_obj)

    # 예외 처리: Top 히트맵이 아예 활성화되지 않은 경우 (면적 0)
    if area_top == 0:
        return False

    # 조건 1: Top의 면적이 Object 면적보다 작은가?
    is_smaller = area_top < area_obj
    
    # 4. 포함 관계 확인 (수정된 부분)
    # 교집합(Intersection) 영역 계산
    intersection = np.logical_and(mask_top, mask_obj)
    intersection_area = np.sum(intersection)

    # [수정됨] 교집합 면적이 Top 전체 면적의 90% 이상인지 확인
    # (intersection_area / area_top) >= 0.9 와 동일한 수식입니다.
    is_inside = intersection_area >= (area_top * containment_ratio)

    # 디버깅용: 실제 겹치는 비율 확인
    # print(f"Overlap Ratio: {intersection_area / area_top:.2f}")

    return is_smaller and is_inside

In [None]:
metrics_tracker_alloutput = MetricsTracker(name="all_output")
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

# 저장할 디렉토리 생성
Layername = "2_top1_exp75"
output_dir = f"./exo_{Layername}"  # 디렉토리 이름 변경 (구분 위해)s
os.makedirs(output_dir, exist_ok=True)

# Contrastive Subtraction 강도 설정 (0.5 ~ 1.0 추천)
patch_size = 24
POS_ALPHA = 0
results_list = []
pos_map = np.zeros((patch_size, patch_size), dtype=np.float32)
for i in range(24):
    i += 1
    pkl_path = f"output_results2/exo_attention_result_32B_2_{i}.pkl"
    if not os.path.exists(pkl_path):
        continue
        
    df_output = pd.read_pickle(pkl_path)

    for idx, row in df_output.iterrows():
        object_name = row['object']
        action = row['action']
        filename = row['filename']
        output_description = row['output_sentence']
        output_attentions = row['output_attentions']
        PLSP_name = prompt_dict_obj[action][object_name]
        exo_name =   os.path.basename(row['random_exo_filename'])

        sum_heatmap = np.zeros((patch_size, patch_size), dtype=np.float32)
        
        exo_img = cv2.imread(row['random_exo_filename'])
        exo_img = cv2.cvtColor(exo_img, cv2.COLOR_BGR2RGB)
        
        
        file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
        gt_path = f"{AGD20K_PATH}/Seen/testset/GT/{action}/{object_name}/{filename.split('.')[0]}.png"
            
        # 이미지 로드
        if not os.path.exists(file_name_real):
            print(f"Image not found: {file_name_real}")
            continue

        orig_img = cv2.imread(file_name_real)
        orig_img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
        h, w, _ = orig_img.shape

        print(f"[{idx}] Processing: {action} - {object_name}")

        # --- 2. [변경] Contrastive Attention Map 계산 ---
        token_scores = []
        token_idx = 0
        for token in output_attentions:
            # 토큰별 히트맵 초기화
            token_heatmap = np.zeros((patch_size, patch_size), dtype=np.float32)
            token_head_count = 0
            
            attention_value = token['attentions']
            decoded_str = token['token_str'] # 디버깅용

            for each_attention in attention_value:
                layer = each_attention['layer']
                head = each_attention['head']
                
                # # 사용자가 설정한 특정 레이어 필터링 (여기선 Layer 0 유지)
                # if each_attention['layer'] != 0:
                if 1==1: # (layer <=45) and (layer >= 20): #1==1: # (layer == 26) : #and( head ==20)) : #or ((layer == 24) and( head ==31)):
                    sum_heatmap += each_attention['heatmap']
                    token_heatmap += each_attention['heatmap']
                    token_head_count += 1

            
            # 해당 레이어의 헤드가 하나도 없으면 스킵
            if token_head_count == 0:
                continue

            # Visual Dependency Score (S_img) 계산: 맵의 총합
            s_img = token_heatmap.sum()

            # 리스트에 저장
            token_scores.append({
                "token": decoded_str,
                "token_idx" : token_idx,
                "score": s_img,
                "heatmap": token_heatmap, 
                "count": token_head_count
            })
            token_idx +=1
        # 예외 처리: 토큰이 없을 경우
        if len(token_scores) == 0:
            print("No valid tokens found.")
            continue

        # 정렬 (Score 기준 오름차순)
        sorted_tokens = sorted(token_scores, key=lambda x: x['score'])
        
        # Top 5 (Signal) & Bottom 5 (Noise) 선정
        # 토큰 개수가 10개 미만일 경우 처리
        num_select = min(1, len(sorted_tokens) // 2)
        if num_select < 1: num_select = 1 # 최소 1개

        bottom_tokens = sorted_tokens[:num_select]       # Noise (기능어, 배경 등)
        top_tokens = sorted_tokens[-num_select:][::-1]   # Signal (명사, 핵심어)

        top_token_idx  = top_tokens[-1]['token_idx']
        top_token_text  = top_tokens[-1]['token']

        following_token_idx = top_token_idx + 1
        following_token = None
        for item in token_scores:
            if item['token_idx'] == following_token_idx:
                following_token = item
                break
        following_text = following_token['token']

        clip_object_heatmap = get_clipseg_heatmap(
            file_name_real,
            clip_model,
            clip_processor,
            object_name,
        )

        clip_top_heatmap = get_clipseg_heatmap(
            file_name_real,
            clip_model,
            clip_processor,
            top_token_text + ' ' + following_text,
        )
        pos_map = np.sum([t['heatmap'] for t in top_tokens], axis=0)
        pos_map /= len(top_tokens)
        
        # Noise Map (Negative) 평균
        neg_map = np.sum([t['heatmap'] for t in bottom_tokens], axis=0)
        neg_map /= len(bottom_tokens)



        # 정규화 (스케일 맞추기 위해 0~1로 변환 후 뺄셈 진행)
        if pos_map.max() > 0: pos_map /= pos_map.max()
        if neg_map.max() > 0: neg_map /= neg_map.max()

        # ✨ Contrastive Subtraction (Signal - alpha * Noise)
        CONTRASTIVE_ALPHA = 0
        contrastive_heatmap = (pos_map) - (CONTRASTIVE_ALPHA * neg_map)
        # contrastive_heatmap = sum_heatmap / (token_head_count + 1e-8)
        # --- 3. 정규화 및 후처리 (기존 코드 흐름 연결) ---
        # Contrastive Map을 avg_norm 변수로 사용 (0~1 정규화)
        h_min, h_max = contrastive_heatmap.min(), contrastive_heatmap.max()

        
        avg_norm = (contrastive_heatmap - h_min) / (h_max - h_min + 1e-8)

        if check_heatmap_containment(clip_top_heatmap,clip_object_heatmap):
            clip_heatmap = clip_top_heatmap
            clipseg_input_text = top_token_text+ ' ' + following_text
            print(f"Selected CLIP input : {top_token_text}")
            # Signal Map (Positive) 평균



        else:
            clip_heatmap = clip_object_heatmap
            clipseg_input_text = object_name
            # h_min, h_max = token_heatmap.min(), token_heatmap.max()
            # avg_norm = (token_heatmap - h_min) / (h_max - h_min + 1e-8)

            print(f"Selected CLIP input : {object_name}")

        clip_heatmap_resized = cv2.resize(clip_heatmap, (patch_size, patch_size), interpolation=cv2.INTER_LINEAR)
        clip_binary_mask = (clip_heatmap_resized > 0.15).astype(np.float32) # 필요시 사용


        # CLIPSeg와 Hadamard Product
        avg_norm_cliped = avg_norm * clip_heatmap_resized
        gamma =  0.75  # 0
        avg_norm_cliped = np.power(avg_norm_cliped, gamma)
        # 리사이즈 및 블러링
        avg_norm_cliped_rescaled = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)
        
        sig = min(w, h) * 0.05
        k_val = int(sig * 3) * 2 + 1 
        kernel_size = (k_val, k_val)

        # 블러 적용
        blur_map = cv2.GaussianBlur(avg_norm_cliped_rescaled, kernel_size, sig)

        # 블러 후 다시 정규화
        blur_map = min_max_normalize(blur_map) # 함수가 정의되어 있다고 가정
        avg_norm_cliped_blur = blur_map
        
        # 시각화를 위해 31x31 맵도 원본 크기로 리사이즈
        avg_norm_resized_vis = cv2.resize(avg_norm, (w, h), interpolation=cv2.INTER_LINEAR)
        clip_vis = cv2.resize(clip_heatmap_resized, (w, h), interpolation=cv2.INTER_NEAREST)

        # --- 4. GT 평가 및 메트릭 계산 ---
        gt_map = load_ground_truth(gt_path) # 함수 정의 가정
        if gt_map is not None:
            metrics_dino = calculate_metrics(avg_norm_cliped_blur, gt_map) # 함수 정의 가정
            metrics_tracker_alloutput.update(metrics_dino) # 객체 정의 가정
            
            # 메트릭 텍스트
            metrics_text = f"[{object_name} {action}] KLD: {metrics_dino['KLD']:.4f} | SIM: {metrics_dino['SIM']:.4f} | NSS: {metrics_dino['NSS']:.4f}"
            metrics_tracker_alloutput.print_metrics(metrics_dino, filename)
        else:
            print("NO GT!!!")
            metrics_text = "No GT Available"
            continue
        
        results_list.append({
            'object': object_name,
            'action': action,
            'filename': filename,
            'output_sentence': output_description,
            'top_token_text': top_token_text,
            'following_text': following_text,
            'clip_input': clipseg_input_text,
            'KLD': metrics_dino['KLD'],
            'SIM': metrics_dino['SIM'],
            'NSS': metrics_dino['NSS']
        })

        # --- 5. 시각화 ---
        fig, axes = plt.subplots(1, 7, figsize=(24, 5)) # 사이즈 살짝 조정
        
        # Signal 단어와 Noise 단어 표시 (제목용)
        top_words = ",".join([f"'{t['token'].strip()}'" for t in top_tokens[:5]])
        
        main_title = f"Obj: {object_name} | Act: {action} |{metrics_text}\nTop Tokens: [{top_words}({top_token_idx } ), clipseg input : {top_token_text} {following_text}] \n Whole answer : {output_description}"
        fig.suptitle(main_title, fontsize=14, fontweight='bold', y=0.98)

        # (1) 원본 이미지
        axes[0].imshow(orig_img)
        axes[0].set_title(f"Original\n({object_name})")
        axes[0].axis('off')

        # (2) Contrastive Attention (Pos - Neg)
        im1 = axes[1].imshow(avg_norm_resized_vis, cmap='jet', interpolation='bilinear')
        axes[1].set_title(f"Attention Map {Layername}")
        axes[1].axis('off')
        plt.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)

        # (3) CLIPSeg Result
        axes[2].imshow(clip_vis, cmap='gray')
        axes[2].set_title(f"CLIPSeg {clipseg_input_text}")
        axes[2].axis('off')

        # (4) Hadamard (Contrastive x CLIPSeg)
        # 리사이즈하여 시각화
        hadamard_vis = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)
        im3 = axes[3].imshow(hadamard_vis, cmap='jet', interpolation='bilinear')
        axes[3].set_title("Hadamard\n(Contrastive x CLIP)")
        axes[3].axis('off')
        plt.colorbar(im3, ax=axes[3], fraction=0.046, pad=0.04)

        # (5) Final Blurred Result
        im4 = axes[4].imshow(avg_norm_cliped_blur, cmap='jet', interpolation='bilinear')
        axes[4].set_title("Final Blurred")
        axes[4].axis('off')
        plt.colorbar(im4, ax=axes[4], fraction=0.046, pad=0.04)

        # (6) Ground Truth
        axes[5].imshow(gt_map, cmap='gray') # GT는 보통 binary 혹은 gray
        axes[5].set_title("Ground Truth")
        axes[5].axis('off')

        # (6) exo Image
        axes[6].imshow(exo_img, cmap='gray') # GT는 보통 binary 혹은 gray
        axes[6].set_title("Ground Truth")
        axes[6].axis('off')


        # 파일 저장
        save_path = os.path.join(output_dir, f"exo_{object_name}_{action}_{filename.split('.')[0]}_{exo_name.split('.')[0]}.png")
        plt.tight_layout()
        plt.savefig(save_path, bbox_inches='tight', dpi=150)
        plt.close(fig)

df_analy = pd.DataFrame(results_list)
df_analy.to_pickle("2_exo_attention_top1.pkl")
        # print(f"clipseg_input_text : {clipseg_input_text}")

[0] Processing: cut - apple


  return self.preprocess(images, **kwargs)


Selected CLIP input : apple

Metrics for all_output apple_000054.jpg:
 all_output Current - KLD: 0.6868 | SIM: 0.6305 | NSS: 0.7139

Cumulative all_output  Averages over 1 samples:
Average - KLD: 0.6868 | SIM: 0.6305 | NSS: 0.7139

[1] Processing: eat - apple
Selected CLIP input :  the

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.0485 | SIM: 0.8789 | NSS: 1.2721

Cumulative all_output  Averages over 2 samples:
Average - KLD: 0.3676 | SIM: 0.7547 | NSS: 0.9930

[2] Processing: peel - apple
Selected CLIP input : apple

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.1676 | SIM: 0.7868 | NSS: 1.3753

Cumulative all_output  Averages over 3 samples:
Average - KLD: 0.3010 | SIM: 0.7654 | NSS: 1.1204

[3] Processing: hit - axe
Selected CLIP input :  curved

Metrics for all_output axe_000961.jpg:
 all_output Current - KLD: 2.6712 | SIM: 0.1018 | NSS: -0.2334

Cumulative all_output  Averages over 4 samples:
Average - KLD: 0.8935 | SIM: 0.5995 | NS

  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle

Metrics for all_output badminton_racket_002255.jpg:
 all_output Current - KLD: 0.8326 | SIM: 0.4569 | NSS: 2.2282

Cumulative all_output  Averages over 6 samples:
Average - KLD: 0.7691 | SIM: 0.6009 | NSS: 1.3202

[6] Processing: swing - badminton_racket
Selected CLIP input : badminton_racket

Metrics for all_output badminton_racket_003649.jpg:
 all_output Current - KLD: 3.4653 | SIM: 0.0437 | NSS: -0.3587

Cumulative all_output  Averages over 7 samples:
Average - KLD: 1.1543 | SIM: 0.5213 | NSS: 1.0803

[7] Processing: cut - banana
Selected CLIP input : banana

Metrics for all_output banana_002623.jpg:
 all_output Current - KLD: 0.1011 | SIM: 0.8332 | NSS: 1.4134

Cumulative all_output  Averages over 8 samples:
Average - KLD: 1.0226 | SIM: 0.5603 | NSS: 1.1220

[8] Processing: eat - banana
Selected CLIP input : banana

Metrics for all_output banana_002458.jpg:
 all_output Current - KLD: 0.3266 | SIM: 0.6819 | NSS: 1.6803

Cumulative all_output  Averages 

  return self.preprocess(images, **kwargs)


Selected CLIP input : baseball

Metrics for all_output baseball_002670.jpg:
 all_output Current - KLD: 0.1526 | SIM: 0.8481 | NSS: 3.3807

Cumulative all_output  Averages over 11 samples:
Average - KLD: 0.9161 | SIM: 0.5748 | NSS: 1.3584

[11] Processing: hit - baseball_bat
Selected CLIP input : baseball_bat

Metrics for all_output baseball_bat_001882.jpg:
 all_output Current - KLD: 2.2722 | SIM: 0.1382 | NSS: 0.1849

Cumulative all_output  Averages over 12 samples:
Average - KLD: 1.0291 | SIM: 0.5384 | NSS: 1.2606

[12] Processing: hold - baseball_bat
Selected CLIP input : baseball_bat

Metrics for all_output baseball_bat_002547.jpg:
 all_output Current - KLD: 2.4875 | SIM: 0.1159 | NSS: 0.5922

Cumulative all_output  Averages over 13 samples:
Average - KLD: 1.1413 | SIM: 0.5059 | NSS: 1.2092

[13] Processing: swing - baseball_bat
Selected CLIP input : baseball_bat

Metrics for all_output baseball_bat_001882.jpg:
 all_output Current - KLD: 2.2096 | SIM: 0.1461 | NSS: 0.2499

Cumulativ

  return self.preprocess(images, **kwargs)


Selected CLIP input :  gray

Metrics for all_output bed_002880.jpg:
 all_output Current - KLD: 0.2175 | SIM: 0.7502 | NSS: 2.9233

Cumulative all_output  Averages over 16 samples:
Average - KLD: 1.1488 | SIM: 0.4907 | NSS: 1.3362

[16] Processing: sit_on - bed
Selected CLIP input :  blue

Metrics for all_output bed_003622.jpg:
 all_output Current - KLD: 0.5706 | SIM: 0.5783 | NSS: 1.6768

Cumulative all_output  Averages over 17 samples:
Average - KLD: 1.1148 | SIM: 0.4958 | NSS: 1.3562

[17] Processing: lie_on - bench
Selected CLIP input :  by

Metrics for all_output bench_003727.jpg:
 all_output Current - KLD: 1.3515 | SIM: 0.3781 | NSS: 0.7633

Cumulative all_output  Averages over 18 samples:
Average - KLD: 1.1279 | SIM: 0.4893 | NSS: 1.3233

[18] Processing: sit_on - bench
Selected CLIP input : bench

Metrics for all_output bench_001877.jpg:
 all_output Current - KLD: 0.1507 | SIM: 0.7902 | NSS: 1.8560

Cumulative all_output  Averages over 19 samples:
Average - KLD: 1.0765 | SIM: 0.

  return self.preprocess(images, **kwargs)


Selected CLIP input :  seat

Metrics for all_output bicycle_003046.jpg:
 all_output Current - KLD: 0.9201 | SIM: 0.4791 | NSS: 1.5760

Cumulative all_output  Averages over 21 samples:
Average - KLD: 1.1504 | SIM: 0.4847 | NSS: 1.2960

[21] Processing: sit_on - bicycle
Selected CLIP input :  the

Metrics for all_output bicycle_002100.jpg:
 all_output Current - KLD: 1.4445 | SIM: 0.2989 | NSS: 2.3524

Cumulative all_output  Averages over 22 samples:
Average - KLD: 1.1638 | SIM: 0.4763 | NSS: 1.3441

[22] Processing: look_out - binoculars
Selected CLIP input : binoculars

Metrics for all_output binoculars_003630.jpg:
 all_output Current - KLD: 1.0479 | SIM: 0.3935 | NSS: 0.5802

Cumulative all_output  Averages over 23 samples:
Average - KLD: 1.1587 | SIM: 0.4727 | NSS: 1.3108

[23] Processing: hold - book
Selected CLIP input : book

Metrics for all_output book_001195.jpg:
 all_output Current - KLD: 0.7563 | SIM: 0.5065 | NSS: 1.1489

Cumulative all_output  Averages over 24 samples:
Averag

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the

Metrics for all_output bottle_003259.jpg:
 all_output Current - KLD: 1.7136 | SIM: 0.2093 | NSS: 1.6973

Cumulative all_output  Averages over 26 samples:
Average - KLD: 1.1481 | SIM: 0.4651 | NSS: 1.3571

[26] Processing: hold - bottle
Selected CLIP input : bottle

Metrics for all_output bottle_001227.jpg:
 all_output Current - KLD: 0.6119 | SIM: 0.5410 | NSS: 1.9510

Cumulative all_output  Averages over 27 samples:
Average - KLD: 1.1282 | SIM: 0.4679 | NSS: 1.3791

[27] Processing: open - bottle
Selected CLIP input :  green

Metrics for all_output bottle_001033.jpg:
 all_output Current - KLD: 0.4656 | SIM: 0.6252 | NSS: 3.7235

Cumulative all_output  Averages over 28 samples:
Average - KLD: 1.1046 | SIM: 0.4735 | NSS: 1.4629

[28] Processing: pour - bottle
Selected CLIP input :  pink

Metrics for all_output bottle_002780.jpg:
 all_output Current - KLD: 4.2887 | SIM: 0.0571 | NSS: -0.2626

Cumulative all_output  Averages over 29 samples:
Average - KLD: 1.214

  return self.preprocess(images, **kwargs)


Selected CLIP input :  creamy

Metrics for all_output bowl_000134.jpg:
 all_output Current - KLD: 1.0495 | SIM: 0.4813 | NSS: 0.8977

Cumulative all_output  Averages over 31 samples:
Average - KLD: 1.1773 | SIM: 0.4690 | NSS: 1.3705

[31] Processing: wash - bowl
Selected CLIP input : bowl

Metrics for all_output bowl_002825.jpg:
 all_output Current - KLD: 0.7662 | SIM: 0.5081 | NSS: 0.4828

Cumulative all_output  Averages over 32 samples:
Average - KLD: 1.1645 | SIM: 0.4702 | NSS: 1.3428

[32] Processing: eat - broccoli
Selected CLIP input : broccoli

Metrics for all_output broccoli_002796.jpg:
 all_output Current - KLD: 0.1378 | SIM: 0.8218 | NSS: 1.6541

Cumulative all_output  Averages over 33 samples:
Average - KLD: 1.1334 | SIM: 0.4809 | NSS: 1.3522

[33] Processing: take_photo - camera
Selected CLIP input :  the

Metrics for all_output camera_002534.jpg:
 all_output Current - KLD: 0.7341 | SIM: 0.5042 | NSS: 0.3773

Cumulative all_output  Averages over 34 samples:
Average - KLD: 1

  return self.preprocess(images, **kwargs)


Selected CLIP input : carrot

Metrics for all_output carrot_001443.jpg:
 all_output Current - KLD: 0.7906 | SIM: 0.4916 | NSS: 2.8625

Cumulative all_output  Averages over 36 samples:
Average - KLD: 1.1072 | SIM: 0.4809 | NSS: 1.4066

[36] Processing: peel - carrot
Selected CLIP input : carrot

Metrics for all_output carrot_003707.jpg:
 all_output Current - KLD: 0.5758 | SIM: 0.5534 | NSS: 2.0183

Cumulative all_output  Averages over 37 samples:
Average - KLD: 1.0929 | SIM: 0.4829 | NSS: 1.4231

[37] Processing: take_photo - cell_phone
Selected CLIP input :  the

Metrics for all_output cell_phone_000601.jpg:
 all_output Current - KLD: 0.3870 | SIM: 0.6479 | NSS: 1.5962

Cumulative all_output  Averages over 38 samples:
Average - KLD: 1.0743 | SIM: 0.4872 | NSS: 1.4277

[38] Processing: talk_on - cell_phone
Selected CLIP input : cell_phone

Metrics for all_output cell_phone_000601.jpg:
 all_output Current - KLD: 0.6337 | SIM: 0.5337 | NSS: 1.5027

Cumulative all_output  Averages over 39 

  return self.preprocess(images, **kwargs)


Selected CLIP input :  blue

Metrics for all_output chair_002839.jpg:
 all_output Current - KLD: 1.0213 | SIM: 0.3803 | NSS: 0.6478

Cumulative all_output  Averages over 41 samples:
Average - KLD: 1.0586 | SIM: 0.4845 | NSS: 1.4083

[41] Processing: lie_on - couch
Selected CLIP input : couch

Metrics for all_output couch_003293.jpg:
 all_output Current - KLD: 0.8194 | SIM: 0.4649 | NSS: 1.4543

Cumulative all_output  Averages over 42 samples:
Average - KLD: 1.0529 | SIM: 0.4840 | NSS: 1.4094

[42] Processing: sit_on - couch
Selected CLIP input :  three

Metrics for all_output couch_000779.jpg:
 all_output Current - KLD: 0.8506 | SIM: 0.4467 | NSS: 1.5929

Cumulative all_output  Averages over 43 samples:
Average - KLD: 1.0482 | SIM: 0.4831 | NSS: 1.4137

[43] Processing: drink_with - cup
Selected CLIP input : cup

Metrics for all_output cup_000508.jpg:
 all_output Current - KLD: 0.5905 | SIM: 0.5385 | NSS: 2.5362

Cumulative all_output  Averages over 44 samples:
Average - KLD: 1.0378 | 

  return self.preprocess(images, **kwargs)


Selected CLIP input : cup

Metrics for all_output cup_001535.jpg:
 all_output Current - KLD: 1.6752 | SIM: 0.2794 | NSS: 1.3684

Cumulative all_output  Averages over 46 samples:
Average - KLD: 1.0564 | SIM: 0.4767 | NSS: 1.4461

[46] Processing: sip - cup
Selected CLIP input :  the

Metrics for all_output cup_001864.jpg:
 all_output Current - KLD: 0.3649 | SIM: 0.6663 | NSS: 1.0845

Cumulative all_output  Averages over 47 samples:
Average - KLD: 1.0417 | SIM: 0.4808 | NSS: 1.4384

[47] Processing: wash - cup
Selected CLIP input :  decorated

Metrics for all_output cup_003621.jpg:
 all_output Current - KLD: 2.7428 | SIM: 0.1157 | NSS: -0.1340

Cumulative all_output  Averages over 48 samples:
Average - KLD: 1.0772 | SIM: 0.4732 | NSS: 1.4056

[48] Processing: throw - discus
Selected CLIP input :  metallic

Metrics for all_output discus_003558.jpg:
 all_output Current - KLD: 0.5172 | SIM: 0.5922 | NSS: 0.6706

Cumulative all_output  Averages over 49 samples:
Average - KLD: 1.0657 | SIM: 0

  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle

Metrics for all_output fork_000804.jpg:
 all_output Current - KLD: 0.7146 | SIM: 0.5054 | NSS: 1.9396

Cumulative all_output  Averages over 51 samples:
Average - KLD: 1.0533 | SIM: 0.4763 | NSS: 1.3862

[51] Processing: lift - fork
Selected CLIP input : fork

Metrics for all_output fork_001691.jpg:
 all_output Current - KLD: 1.6293 | SIM: 0.2320 | NSS: 0.1815

Cumulative all_output  Averages over 52 samples:
Average - KLD: 1.0644 | SIM: 0.4716 | NSS: 1.3630

[52] Processing: stick - fork
Selected CLIP input : fork

Metrics for all_output fork_000095.jpg:
 all_output Current - KLD: 0.6982 | SIM: 0.5058 | NSS: 2.0132

Cumulative all_output  Averages over 53 samples:
Average - KLD: 1.0575 | SIM: 0.4723 | NSS: 1.3753

[53] Processing: wash - fork
Selected CLIP input :  the

Metrics for all_output fork_001691.jpg:
 all_output Current - KLD: 0.9591 | SIM: 0.4053 | NSS: 2.1540

Cumulative all_output  Averages over 54 samples:
Average - KLD: 1.0557 | SIM: 0.4710 

  return self.preprocess(images, **kwargs)


Selected CLIP input :  star

Metrics for all_output frisbee_001130.jpg:
 all_output Current - KLD: 2.0018 | SIM: 0.2101 | NSS: -0.5185

Cumulative all_output  Averages over 56 samples:
Average - KLD: 1.0682 | SIM: 0.4664 | NSS: 1.3292

[56] Processing: throw - frisbee
Selected CLIP input :  solid

Metrics for all_output frisbee_003249.jpg:
 all_output Current - KLD: 0.7029 | SIM: 0.5099 | NSS: 0.3768

Cumulative all_output  Averages over 57 samples:
Average - KLD: 1.0618 | SIM: 0.4671 | NSS: 1.3125

[57] Processing: hold - golf_clubs
Selected CLIP input :  black

Metrics for all_output golf_clubs_000045.jpg:
 all_output Current - KLD: 0.7653 | SIM: 0.5367 | NSS: 1.6625

Cumulative all_output  Averages over 58 samples:
Average - KLD: 1.0567 | SIM: 0.4683 | NSS: 1.3186

[58] Processing: swing - golf_clubs
Selected CLIP input : golf_clubs

Metrics for all_output golf_clubs_001992.jpg:
 all_output Current - KLD: 2.1338 | SIM: 0.1562 | NSS: 0.6689

Cumulative all_output  Averages over 59 sa

  return self.preprocess(images, **kwargs)


Selected CLIP input :  handle

Metrics for all_output hammer_000215.jpg:
 all_output Current - KLD: 0.9788 | SIM: 0.3882 | NSS: 2.5001

Cumulative all_output  Averages over 61 samples:
Average - KLD: 1.0866 | SIM: 0.4574 | NSS: 1.3102

[61] Processing: eat - hot_dog
Selected CLIP input : hot_dog

Metrics for all_output hot_dog_002166.jpg:
 all_output Current - KLD: 0.1889 | SIM: 0.7694 | NSS: 1.1958

Cumulative all_output  Averages over 62 samples:
Average - KLD: 1.0721 | SIM: 0.4624 | NSS: 1.3084

[62] Processing: throw - javelin
Selected CLIP input :  yellow

Metrics for all_output javelin_001474.jpg:
 all_output Current - KLD: 0.1957 | SIM: 0.8225 | NSS: 3.6327

Cumulative all_output  Averages over 63 samples:
Average - KLD: 1.0582 | SIM: 0.4681 | NSS: 1.3453

[63] Processing: type_on - keyboard
Selected CLIP input : keyboard

Metrics for all_output keyboard_000439.jpg:
 all_output Current - KLD: 0.2796 | SIM: 0.7237 | NSS: 1.3746

Cumulative all_output  Averages over 64 samples:
Av

  return self.preprocess(images, **kwargs)


Selected CLIP input :  black

Metrics for all_output knife_002682.jpg:
 all_output Current - KLD: 0.2292 | SIM: 0.7697 | NSS: 3.3138

Cumulative all_output  Averages over 66 samples:
Average - KLD: 1.0224 | SIM: 0.4805 | NSS: 1.3916

[66] Processing: stick - knife
Selected CLIP input : knife

Metrics for all_output knife_001072.jpg:
 all_output Current - KLD: 1.5181 | SIM: 0.2527 | NSS: 1.9174

Cumulative all_output  Averages over 67 samples:
Average - KLD: 1.0298 | SIM: 0.4771 | NSS: 1.3995

[67] Processing: wash - knife
Selected CLIP input : knife

Metrics for all_output knife_002720.jpg:
 all_output Current - KLD: 0.7094 | SIM: 0.5390 | NSS: 2.7525

Cumulative all_output  Averages over 68 samples:
Average - KLD: 1.0251 | SIM: 0.4780 | NSS: 1.4194

[68] Processing: type_on - laptop
Selected CLIP input :  keyboard

Metrics for all_output laptop_000585.jpg:
 all_output Current - KLD: 0.4968 | SIM: 0.5994 | NSS: 1.9287

Cumulative all_output  Averages over 69 samples:
Average - KLD: 1.0

  return self.preprocess(images, **kwargs)


Selected CLIP input :  rear

Metrics for all_output motorcycle_003541.jpg:
 all_output Current - KLD: 2.2713 | SIM: 0.1649 | NSS: 0.1626

Cumulative all_output  Averages over 71 samples:
Average - KLD: 1.0405 | SIM: 0.4729 | NSS: 1.3939

[71] Processing: ride - motorcycle
Selected CLIP input :  the

Metrics for all_output motorcycle_002198.jpg:
 all_output Current - KLD: 0.8268 | SIM: 0.4984 | NSS: 1.7397

Cumulative all_output  Averages over 72 samples:
Average - KLD: 1.0375 | SIM: 0.4733 | NSS: 1.3987

[72] Processing: sit_on - motorcycle
Selected CLIP input :  black

Metrics for all_output motorcycle_000837.jpg:
 all_output Current - KLD: 1.6376 | SIM: 0.2258 | NSS: 1.8426

Cumulative all_output  Averages over 73 samples:
Average - KLD: 1.0457 | SIM: 0.4699 | NSS: 1.4048

[73] Processing: cut - orange
Selected CLIP input : orange

Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.7380 | SIM: 0.6147 | NSS: 1.3356

Cumulative all_output  Averages over 74 samples:


  return self.preprocess(images, **kwargs)


Selected CLIP input : orange

Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.6967 | SIM: 0.6197 | NSS: 1.3296

Cumulative all_output  Averages over 76 samples:
Average - KLD: 1.0357 | SIM: 0.4748 | NSS: 1.3990

[76] Processing: wash - orange
Selected CLIP input : orange

Metrics for all_output orange_001193.jpg:
 all_output Current - KLD: 0.7213 | SIM: 0.6127 | NSS: 1.2241

Cumulative all_output  Averages over 77 samples:
Average - KLD: 1.0316 | SIM: 0.4766 | NSS: 1.3968

[77] Processing: open - oven
Selected CLIP input : oven

Metrics for all_output oven_001370.jpg:
 all_output Current - KLD: 0.9908 | SIM: 0.4124 | NSS: 1.5617

Cumulative all_output  Averages over 78 samples:
Average - KLD: 1.0311 | SIM: 0.4758 | NSS: 1.3989

[78] Processing: write - pen
Selected CLIP input : pen

Metrics for all_output pen_003590.jpg:
 all_output Current - KLD: 1.1512 | SIM: 0.3537 | NSS: 1.9244

Cumulative all_output  Averages over 79 samples:
Average - KLD: 1.0326 | SIM: 0.4

  return self.preprocess(images, **kwargs)


Selected CLIP input :  Ever

Metrics for all_output punching_bag_001639.jpg:
 all_output Current - KLD: 0.8681 | SIM: 0.4437 | NSS: 0.5156

Cumulative all_output  Averages over 81 samples:
Average - KLD: 1.0235 | SIM: 0.4758 | NSS: 1.3847

[81] Processing: open - refrigerator
Selected CLIP input :  water

Metrics for all_output refrigerator_002162.jpg:
 all_output Current - KLD: 2.1990 | SIM: 0.2202 | NSS: 0.1886

Cumulative all_output  Averages over 82 samples:
Average - KLD: 1.0378 | SIM: 0.4727 | NSS: 1.3701

[82] Processing: catch - rugby_ball
Selected CLIP input :  green

Metrics for all_output rugby_ball_003522.jpg:
 all_output Current - KLD: 1.5723 | SIM: 0.3857 | NSS: 0.1631

Cumulative all_output  Averages over 83 samples:
Average - KLD: 1.0442 | SIM: 0.4716 | NSS: 1.3555

[83] Processing: kick - rugby_ball
Selected CLIP input :  the

Metrics for all_output rugby_ball_002080.jpg:
 all_output Current - KLD: 0.2183 | SIM: 0.7638 | NSS: 0.9421

Cumulative all_output  Averages ove

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the

Metrics for all_output scissors_002479.jpg:
 all_output Current - KLD: 1.3428 | SIM: 0.2877 | NSS: 1.5040

Cumulative all_output  Averages over 86 samples:
Average - KLD: 1.0290 | SIM: 0.4759 | NSS: 1.3472

[86] Processing: hold - scissors
Selected CLIP input :  the

Metrics for all_output scissors_002479.jpg:
 all_output Current - KLD: 0.7391 | SIM: 0.4906 | NSS: 1.3736

Cumulative all_output  Averages over 87 samples:
Average - KLD: 1.0256 | SIM: 0.4761 | NSS: 1.3475

[87] Processing: carry - skateboard
Selected CLIP input : skateboard

Metrics for all_output skateboard_002668.jpg:
 all_output Current - KLD: 0.1871 | SIM: 0.7725 | NSS: 0.8932

Cumulative all_output  Averages over 88 samples:
Average - KLD: 1.0161 | SIM: 0.4795 | NSS: 1.3424

[88] Processing: hold - skateboard
Selected CLIP input : deck

Metrics for all_output skateboard_002387.jpg:
 all_output Current - KLD: 0.7524 | SIM: 0.5636 | NSS: 0.9504

Cumulative all_output  Averages over 89 sample

  return self.preprocess(images, **kwargs)


Selected CLIP input :  galaxy

Metrics for all_output skateboard_001460.jpg:
 all_output Current - KLD: 0.1165 | SIM: 0.8346 | NSS: 1.1116

Cumulative all_output  Averages over 91 samples:
Average - KLD: 1.0039 | SIM: 0.4833 | NSS: 1.3337

[91] Processing: carry - skis
Selected CLIP input : skis

Metrics for all_output skis_002829.jpg:
 all_output Current - KLD: 2.8446 | SIM: 0.0861 | NSS: -0.2812

Cumulative all_output  Averages over 92 samples:
Average - KLD: 1.0239 | SIM: 0.4790 | NSS: 1.3162

[92] Processing: hold - skis
Selected CLIP input : skis

Metrics for all_output skis_001357.jpg:
 all_output Current - KLD: 2.2711 | SIM: 0.1481 | NSS: 0.5846

Cumulative all_output  Averages over 93 samples:
Average - KLD: 1.0373 | SIM: 0.4754 | NSS: 1.3083

[93] Processing: jump - skis
Selected CLIP input :  length

Metrics for all_output skis_002829.jpg:
 all_output Current - KLD: 2.1363 | SIM: 0.1623 | NSS: 0.0535

Cumulative all_output  Averages over 94 samples:
Average - KLD: 1.0490 | SI

  return self.preprocess(images, **kwargs)


Selected CLIP input : snowboard

Metrics for all_output snowboard_001325.jpg:
 all_output Current - KLD: 1.3215 | SIM: 0.2977 | NSS: 2.1932

Cumulative all_output  Averages over 96 samples:
Average - KLD: 1.0883 | SIM: 0.4656 | NSS: 1.2871

[96] Processing: hold - snowboard
Selected CLIP input :  top

Metrics for all_output snowboard_001704.jpg:
 all_output Current - KLD: 1.8384 | SIM: 0.2108 | NSS: -0.0188

Cumulative all_output  Averages over 97 samples:
Average - KLD: 1.0960 | SIM: 0.4630 | NSS: 1.2737

[97] Processing: jump - snowboard
Selected CLIP input :  central

Metrics for all_output snowboard_001704.jpg:
 all_output Current - KLD: 1.0310 | SIM: 0.4001 | NSS: 0.2159

Cumulative all_output  Averages over 98 samples:
Average - KLD: 1.0954 | SIM: 0.4623 | NSS: 1.2629

[98] Processing: catch - soccer_ball
Selected CLIP input :  the

Metrics for all_output soccer_ball_003333.jpg:
 all_output Current - KLD: 0.0785 | SIM: 0.8688 | NSS: 1.3501

Cumulative all_output  Averages over 99

  return self.preprocess(images, **kwargs)


Selected CLIP input : able

Metrics for all_output suitcase_002998.jpg:
 all_output Current - KLD: 1.2728 | SIM: 0.3435 | NSS: 4.4482

Cumulative all_output  Averages over 101 samples:
Average - KLD: 1.0772 | SIM: 0.4691 | NSS: 1.3054

[101] Processing: hold - suitcase
Selected CLIP input :  extended

Metrics for all_output suitcase_003687.jpg:
 all_output Current - KLD: 1.2926 | SIM: 0.3381 | NSS: 2.0076

Cumulative all_output  Averages over 102 samples:
Average - KLD: 1.0793 | SIM: 0.4678 | NSS: 1.3123

[102] Processing: open - suitcase
Selected CLIP input : suitcase

Metrics for all_output suitcase_000520.jpg:
 all_output Current - KLD: 2.0507 | SIM: 0.1789 | NSS: 0.1128

Cumulative all_output  Averages over 103 samples:
Average - KLD: 1.0888 | SIM: 0.4650 | NSS: 1.3006

[103] Processing: pack - suitcase
Selected CLIP input : suitcase

Metrics for all_output suitcase_002212.jpg:
 all_output Current - KLD: 0.8519 | SIM: 0.4424 | NSS: 1.0431

Cumulative all_output  Averages over 104 s

  return self.preprocess(images, **kwargs)


Selected CLIP input : surfboard

Metrics for all_output surfboard_002422.jpg:
 all_output Current - KLD: 3.0310 | SIM: 0.1017 | NSS: -0.2487

Cumulative all_output  Averages over 106 samples:
Average - KLD: 1.1011 | SIM: 0.4622 | NSS: 1.3011

[106] Processing: hold - surfboard
Selected CLIP input : surfboard

Metrics for all_output surfboard_002631.jpg:
 all_output Current - KLD: 0.5616 | SIM: 0.5703 | NSS: 2.9427

Cumulative all_output  Averages over 107 samples:
Average - KLD: 1.0961 | SIM: 0.4632 | NSS: 1.3164

[107] Processing: jump - surfboard
Selected CLIP input : surfboard

Metrics for all_output surfboard_000658.jpg:
 all_output Current - KLD: 0.8358 | SIM: 0.4597 | NSS: 0.4650

Cumulative all_output  Averages over 108 samples:
Average - KLD: 1.0937 | SIM: 0.4632 | NSS: 1.3085

[108] Processing: lie_on - surfboard
Selected CLIP input : surfboard

Metrics for all_output surfboard_000221.jpg:
 all_output Current - KLD: 0.0646 | SIM: 0.8901 | NSS: 2.3054

Cumulative all_output  Av

  return self.preprocess(images, **kwargs)


Selected CLIP input : tennis_racket

Metrics for all_output tennis_racket_002268.jpg:
 all_output Current - KLD: 3.9741 | SIM: 0.0443 | NSS: -0.5188

Cumulative all_output  Averages over 111 samples:
Average - KLD: 1.1020 | SIM: 0.4660 | NSS: 1.2920

[111] Processing: hold - tennis_racket
Selected CLIP input :  handle

Metrics for all_output tennis_racket_001785.jpg:
 all_output Current - KLD: 0.3827 | SIM: 0.6561 | NSS: 3.3551

Cumulative all_output  Averages over 112 samples:
Average - KLD: 1.0956 | SIM: 0.4677 | NSS: 1.3104

[112] Processing: swing - tennis_racket
Selected CLIP input :  handle

Metrics for all_output tennis_racket_003066.jpg:
 all_output Current - KLD: 0.8158 | SIM: 0.4790 | NSS: 2.0501

Cumulative all_output  Averages over 113 samples:
Average - KLD: 1.0931 | SIM: 0.4678 | NSS: 1.3170

[113] Processing: brush_with - toothbrush
Selected CLIP input : toothbrush

Metrics for all_output toothbrush_001764.jpg:
 all_output Current - KLD: 0.9293 | SIM: 0.4062 | NSS: 3.469

  return self.preprocess(images, **kwargs)


Selected CLIP input :  the

Metrics for all_output toothbrush_001991.jpg:
 all_output Current - KLD: 1.6829 | SIM: 0.2252 | NSS: 1.0944

Cumulative all_output  Averages over 116 samples:
Average - KLD: 1.1007 | SIM: 0.4634 | NSS: 1.3346

[116] Processing: drink_with - wine_glass
Selected CLIP input :  bowl

Metrics for all_output wine_glass_003343.jpg:
 all_output Current - KLD: 1.5753 | SIM: 0.2406 | NSS: 0.9172

Cumulative all_output  Averages over 117 samples:
Average - KLD: 1.1047 | SIM: 0.4615 | NSS: 1.3310

[117] Processing: hold - wine_glass
Selected CLIP input : wine_glass

Metrics for all_output wine_glass_002374.jpg:
 all_output Current - KLD: 0.9523 | SIM: 0.4141 | NSS: 1.3138

Cumulative all_output  Averages over 118 samples:
Average - KLD: 1.1034 | SIM: 0.4611 | NSS: 1.3309

[118] Processing: pour - wine_glass
Selected CLIP input : wine_glass

Metrics for all_output wine_glass_000186.jpg:
 all_output Current - KLD: 0.9956 | SIM: 0.4118 | NSS: 1.6109

Cumulative all_output 

In [None]:
#  TOP1 : KLD: 1.1065 | SIM: 0.4589 | NSS: 1.3299
# All Token : Average - KLD: 1.0833 | SIM: 0.4528 | NSS: 1.3326

(24, 24)

In [25]:
df_analy.to_csv("results_top1_following_gamma.csv",sep="$")

In [6]:
df_analy.columns

Index(['object', 'action', 'filename', 'output_sentence', 'top_token_text',
       'following_text', 'clip_input', 'KLD', 'SIM', 'NSS'],
      dtype='object')

In [6]:
df_analy[df_analy['clip_input'] != df_analy['object']]

Unnamed: 0,object,action,filename,output_sentence,top_token_text,following_text,clip_input,KLD,SIM,NSS
1,apple,eat,apple_001541.jpg,"Based on the first image, the entire flesh of ...",the,apple,the apple,0.048497,0.878917,1.272056
3,axe,hit,axe_000961.jpg,"Based on the first image, the sharp, curved bl...",curved,blade,curved blade,2.671174,0.101751,-0.233426
4,axe,hold,axe_001552.jpg,"Based on the first image, the handle of the ax...",red,and,red and,0.207951,0.750536,2.564849
5,badminton_racket,hold,badminton_racket_002255.jpg,"Based on the first image, the handle (or grip)...",handle,(,handle (,0.832631,0.456883,2.228246
9,banana,peel,banana_000480.jpg,"Based on the first image, the outer yellow ski...",yellow,skin,yellow skin,1.41659,0.31027,0.906147
15,bed,lie_on,bed_002880.jpg,"Based on the first image, when people perform ...",gray,bedding,gray bedding,0.217538,0.750239,2.923309
16,bed,sit_on,bed_003622.jpg,"Based on the first image, when people perform ...",blue,du,blue du,0.57061,0.578344,1.676782
17,bench,lie_on,bench_003727.jpg,"Based on the first image, when people perform ...",by,legs,by legs,1.35145,0.378122,0.763257
19,bicycle,push,bicycle_002432.jpg,"Based on the first image, when people perform ...",a,bicycle,a bicycle,2.785041,0.102958,-0.034037
20,bicycle,ride,bicycle_003046.jpg,"Based on the first image, the part of the bicy...",seat,",","seat ,",0.920077,0.479077,1.575963


In [12]:
df_analy[df_analy['clip_input'] == df_analy['object']][['KLD','SIM','NSS']].mean()

KLD    0.929405
SIM    0.504299
NSS    1.349262
dtype: float64

In [None]:
df_analy[df_analy['clip_input'] != df_analy['object']][['KLD','SIM','NSS']].mean()

KLD    1.222106
SIM    0.443117
NSS    1.227378
dtype: float64

In [None]:
len(df_analy[df_analy['clip_input'] != df_analy['object']])

51

In [None]:
metrics_tracker_alloutput = MetricsTracker(name="all_output")
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

# 저장할 디렉토리 생성
Layername = "layerhead_deep"
output_dir = f"./output_{Layername}"  # 디렉토리 이름 변경 (구분 위해)s
os.makedirs(output_dir, exist_ok=True)

# Contrastive Subtraction 강도 설정 (0.5 ~ 1.0 추천)

POS_ALPHA = 0

for i in range(1):
    i += 1
    pkl_path = f"output_results/attention_result_full_output_32B_{i}.pkl"
    if not os.path.exists(pkl_path):
        continue
        
    df_output = pd.read_pickle(pkl_path)

    row = df_output.iloc[1]


    object_name = row['object']
    action = row['action']
    filename = row['filename']
    output_description = row['output_sentence']
    output_attentions = row['output_attentions']
    
    file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
    gt_path = f"{AGD20K_PATH}/Seen/testset/GT/{action}/{object_name}/{filename.split('.')[0]}.png"
        
    # 이미지 로드
    if not os.path.exists(file_name_real):
        print(f"Image not found: {file_name_real}")
        continue

    orig_img = cv2.imread(file_name_real)
    orig_img = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
    h, w, _ = orig_img.shape

    print(f"[Processing: {action} - {object_name}")

    # --- 1. CLIPSeg Mask 생성 ---

    clip_heatmap = get_clipseg_heatmap(
        file_name_real,
        clip_model,
        clip_processor,
        object_name,
    )

    # CLIPSeg 결과를 31x31로 리사이즈
    clip_heatmap_resized = cv2.resize(clip_heatmap, (31, 31), interpolation=cv2.INTER_LINEAR)
    clip_binary_mask = (clip_heatmap_resized > 0.15).astype(np.float32) # 필요시 사용

    # --- 2. [변경] Contrastive Attention Map 계산 ---
    token_scores = []
    token_idx = 0
    for token in output_attentions:
        # 토큰별 히트맵 초기화
        token_heatmap = np.zeros((31, 31), dtype=np.float32)
        token_head_count = 0
        
        attention_value = token['attentions']
        decoded_str = token['token_str'] # 디버깅용

        for each_attention in attention_value:
            layer = each_attention['layer']
            head = each_attention['head']
            
            # # 사용자가 설정한 특정 레이어 필터링 (여기선 Layer 0 유지)
            # if each_attention['layer'] != 0:
            if 1==1: # (layer <=45) and (layer >= 20): #1==1: # (layer == 26) : #and( head ==20)) : #or ((layer == 24) and( head ==31)):
                
                token_heatmap += each_attention['heatmap']
                token_head_count += 1

        
        # 해당 레이어의 헤드가 하나도 없으면 스킵
        if token_head_count == 0:
            continue

        # Visual Dependency Score (S_img) 계산: 맵의 총합
        s_img = token_heatmap.sum()

        # 리스트에 저장
        token_scores.append({
            "token": decoded_str,
            "token_idx" : token_idx,
            "score": s_img,
            "heatmap": token_heatmap, 
            "count": token_head_count
        })
        token_idx +=1
    # 예외 처리: 토큰이 없을 경우
    if len(token_scores) == 0:
        print("No valid tokens found.")
        continue

    # 정렬 (Score 기준 오름차순)
    sorted_tokens = sorted(token_scores, key=lambda x: x['score'])
    
    # Top 5 (Signal) & Bottom 5 (Noise) 선정
    # 토큰 개수가 10개 미만일 경우 처리
    num_select = min(1, len(sorted_tokens) // 2)
    if num_select < 1: num_select = 1 # 최소 1개

    bottom_tokens = sorted_tokens[:num_select]       # Noise (기능어, 배경 등)
    top_tokens = sorted_tokens[-num_select:][::-1]   # Signal (명사, 핵심어)
    ## 꺼꾸로 last 뽑기


    # Signal Map (Positive) 평균
    pos_map = np.sum([t['heatmap'] for t in top_tokens], axis=0)
    pos_map /= len(top_tokens)
    top_token_idx  = top_tokens[-1]['token_idx']
    top_token_text  = top_tokens[-1]['token']

    # --- 모든 레이어/헤드 어텐션 저장 로직 ---

    # 1. 대상 토큰 데이터 추출 (Top 1 토큰)
    target_token_info = top_tokens[-1] 
    target_token_str = target_token_info['token'].strip()
    target_token_idx = target_token_info['token_idx']


    # ... (위쪽 코드: target_token_idx, target_token_str 추출 부분) ...
    
    print(f"Generating Layer-wise attention maps for token: '{target_token_str}' (Idx: {target_token_idx})")

    # --- [추가] 모든 Layer/Head Attention 시각화 저장 로직 ---

    # 1. 저장할 서브 디렉토리 생성
    # 파일명에 겹치지 않게 토큰 인덱스와 텍스트를 포함
    safe_token_str = "".join([c if c.isalnum() else "_" for c in target_token_str])
    vis_save_dir = os.path.join(output_dir, f"viz_token_{target_token_idx}_{safe_token_str}")
    os.makedirs(vis_save_dir, exist_ok=True)

    # 2. 해당 토큰의 모든 어텐션 데이터 수집 및 레이어별 분류
    # output_attentions[target_token_idx]는 해당 토큰 위치의 정보라고 가정
    # 만약 output_attentions가 토큰들의 리스트라면 아래와 같이 접근
    target_token_data = output_attentions[target_token_idx]
    
    # 레이어별로 Head 정보를 모음: { layer_idx: [ {head: h, heatmap: map}, ... ] }
    layer_wise_attentions = {}
    
    for attn_item in target_token_data['attentions']:
        ly = attn_item['layer']
        hd = attn_item['head']
        hm = attn_item['heatmap']
        
        if ly not in layer_wise_attentions:
            layer_wise_attentions[ly] = []
        layer_wise_attentions[ly].append({'head': hd, 'heatmap': hm})

 # ... (이전 데이터 수집 단계 코드는 동일) ...

# ... (이전 코드: target_token_str 추출, 디렉토리 생성 등) ...

    # [추가 1] 데이터를 모을 리스트 초기화
    att_data_list = []

    # 3. 레이어별 루프를 돌며 이미지 생성 및 데이터 수집
    sorted_layers = sorted(layer_wise_attentions.keys())
    
    for ly in tqdm(sorted_layers, desc=f"Saving Layers for '{target_token_str}'"):
        heads_data = layer_wise_attentions[ly]
        
        # Head 번호 순서대로 정렬
        heads_data.sort(key=lambda x: x['head'])
        
        num_heads = len(heads_data)
        if num_heads == 0: continue

        # 해당 레이어의 전체 Attention Score 합계 계산
        layer_total_score = sum([h['heatmap'].sum() for h in heads_data])

        # 격자 크기 계산
        grid_size = int(np.ceil(np.sqrt(num_heads)))
        
        # 캔버스 생성
        fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20))
        fig.suptitle(
            f"Token: '{target_token_str}' (Idx: {target_token_idx}) - Layer {ly}\n"
            f"Layer Total Attention Sum: {layer_total_score:.4f}", 
            fontsize=24
        )
        
        if isinstance(axes, np.ndarray):
            axes_flat = axes.flatten()
        else:
            axes_flat = [axes]
            
        # 각 Head별 처리
        for i, ax in enumerate(axes_flat):
            if i < num_heads:
                head_info = heads_data[i]
                h_idx = head_info['head']
                h_map = head_info['heatmap']
                
                # 개별 Head Score 계산
                head_score = h_map.sum()
                
                # [추가 2] 리스트에 데이터 추가 (Layer, Head, Attention Sum)
                att_data_list.append({
                    'Layer': ly,
                    'Head': h_idx,
                    'Attention_Sum': head_score,
                    'Layer_Total_Sum': layer_total_score  # 필요 시 레이어 총합도 함께 저장
                })

                # 시각화
                im = ax.imshow(h_map, cmap='viridis', interpolation='nearest')
                ax.set_title(f"Head {h_idx}\nSum: {head_score:.2f}", fontsize=10)
                ax.axis('off')
            else:
                ax.axis('off')
        
        plt.tight_layout(rect=[0, 0.03, 1, 0.93])
        save_path = os.path.join(vis_save_dir, f"layer_{ly:03d}.png")
        plt.savefig(save_path)
        plt.close(fig) 

    # [추가 3] DataFrame 생성 및 저장
    df_att = pd.DataFrame(att_data_list)
    
    # 보기 좋게 컬럼 순서 정렬 (Layer -> Head -> Sum)
    df_att = df_att[['Layer', 'Head', 'Attention_Sum', 'Layer_Total_Sum']]
    
    # CSV 파일로 저장
    csv_path = os.path.join(vis_save_dir, "attention_sums.csv")
    df_att.to_csv(csv_path, index=False)

    print(f"Saved all layer maps and DataFrame to: {vis_save_dir}")
    print("DataFrame Head 미리보기:")
    print(df_att.head())

[Processing: eat - apple


  return self.preprocess(images, **kwargs)


Generating Layer-wise attention maps for token: 'flesh' (Idx: 1)


Saving Layers for 'flesh': 100%|██████████| 64/64 [01:59<00:00,  1.87s/it]

Saved all layer maps and DataFrame to: ./output_layerhead_deep/viz_token_1_flesh
DataFrame Head 미리보기:
   Layer  Head  Attention_Sum  Layer_Total_Sum
0      0     0       0.019457         2.554762
1      0     1       0.070488         2.554762
2      0     2       0.000568         2.554762
3      0     3       0.040754         2.554762
4      0     4       0.035283         2.554762





In [None]:
df_att.groupby(['Layer'])['Attention_Sum'].sum().sort_values()

Layer
20    0.376303
23    0.566991
1     0.760726
10    0.854804
9     0.889243
        ...   
50    8.005047
45    8.019423
17    8.284877
46    8.924697
41    9.288528
Name: Attention_Sum, Length: 64, dtype: float32

In [None]:
df_att[df_att['Layer']==20].sort_values("Attention_Sum")

Unnamed: 0,Layer,Head,Attention_Sum,Layer_Total_Sum
1302,20,22,1.028297e-08,0.376303
1338,20,58,1.797992e-07,0.376303
1304,20,24,2.148592e-07,0.376303
1307,20,27,2.576361e-07,0.376303
1291,20,11,3.766697e-07,0.376303
...,...,...,...,...
1333,20,53,2.764578e-02,0.376303
1311,20,31,3.363384e-02,0.376303
1342,20,62,3.683291e-02,0.376303
1325,20,45,4.368176e-02,0.376303


In [None]:
df_att

Unnamed: 0,Layer,Head,Attention_Sum,Layer_Total_Sum
0,0,0,0.019457,2.554762
1,0,1,0.070488,2.554762
2,0,2,0.000568,2.554762
3,0,3,0.040754,2.554762
4,0,4,0.035283,2.554762
...,...,...,...,...
4091,63,59,0.052509,2.854838
4092,63,60,0.032250,2.854838
4093,63,61,0.033217,2.854838
4094,63,62,0.092705,2.854838
