In [1]:
import pandas as pd
import numpy as np

import numpy as np
import matplotlib.pyplot as plt
import cv2
import os


from scipy.stats import pearsonr
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

# Both the preprocessing pipeline and the segmentation network are restored
# from one pretrained checkpoint id, kept in a single place so they cannot drift.
_CLIPSEG_CHECKPOINT = "CIDAS/clipseg-rd64-refined"
processor = CLIPSegProcessor.from_pretrained(_CLIPSEG_CHECKPOINT)
clip_model = CLIPSegForImageSegmentation.from_pretrained(_CLIPSEG_CHECKPOINT)

import os
import sys
# Put the parent of the working directory on the import path.
# NOTE(review): presumably this is where `config` and the helper modules
# imported just below live — confirm against the repository layout.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so that re-running this cell does not keep adding the
# same entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from config import AGD20K_PATH, model_name

from VLM_model_dot_relative import MetricsTracker
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
    load_ground_truth,
    prompt_dict_obj,
    get_clipseg_heatmap,
    calculate_metrics
)

def min_max_normalize(arr):
    """Rescale ``arr`` linearly so that its minimum maps to 0.

    Args:
        arr: NumPy array to rescale.

    Returns:
        An array of the same shape. When every element is equal (zero value
        range) an all-zero array created with ``np.zeros_like`` is returned.
        Otherwise the result is ``(arr - min) / (range + 1e-8)``; because of
        the small epsilon the maximum lands marginally below 1.
    """
    lowest = arr.min()
    value_range = arr.max() - lowest
    # A constant array has no range to stretch; return zeros instead of dividing.
    if value_range == 0:
        return np.zeros_like(arr)
    shifted = arr - lowest
    return shifted / (value_range + 1e-8)

# Tracker shared by the evaluation loop further down, which feeds it
# per-sample metrics via .update() and reports them via .print_metrics().
# NOTE(review): "all_output" looks like the label echoed in the printed
# metric lines — confirm against MetricsTracker.
metrics_tracker_lastinput = MetricsTracker(name="all_output")

# Bare expression: shows the dataset root imported from `config` as the cell output.
AGD20K_PATH

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


'/home/DATA/AGD20K'

In [2]:
# Load the stored attention results; the evaluation loop below processes one row at a time.
# SECURITY: unpickling can execute arbitrary code — only load files produced
# by a trusted pipeline.
df_output = pd.read_pickle("output_results/attention_result_32B.pkl")
# Bare expression: show the available columns as the cell output.
df_output.columns

Index(['action', 'object', 'filename', 'description', 's_img'], dtype='object')

In [11]:
# [{'layer': 0,
#   'head': 0,
#   'S_img': 0.09716796875,
#   'heatmap': array([[3.31878662e-04, 5.34057617e-04, 4.65393066e-04, 3.81469727e-04,
#           2.13623047e-04, 2.82526016e-05, 4.88758087e-05, 1.28173828e-03,
#           1.13010406e-04, 4.52995300e-05, 2.63750553e-06, 5.41210175e-05,
#           9.48905945e-05, 1.61170959e-04, 1.51395798e-05, 3.36170197e-05,
#           7.39097595e-05, 1.99317932e-04, 1.74045563e-05, 7.82012939e-05,
#           1.21593475e-05, 3.05175781e-05, 1.29342079e-05, 1.66893005e-05,
#           1.00135803e-04, 2.63214111e-04, 1.09195709e-04, 1.37329102e-03,
#           3.69548798e-05, 1.66893005e-04, 3.62396240e-05],
#          [9.35792923e-06, 4.76837158e-06, 5.93066216e-06, 8.22544098e-06,

In [3]:
# Evaluate every row of df_output: average the selected layer's attention
# heatmaps, weight them with the CLIPSeg heatmap, blur, score against the
# ground truth, and save a 6-panel comparison figure per sample.

# Directory that receives one figure per sample.
output_dir = "./input_last_L26_clipseg"
os.makedirs(output_dir, exist_ok=True)

TARGET_LAYER = 26        # only heads whose 'layer' equals this value are averaged
ATTN_GRID = (31, 31)     # (rows, cols) of the accumulator the heatmaps are summed into
BLUR_SIGMA_RATIO = 0.05  # Gaussian sigma as a fraction of the shorter image side


def _read_rgb(path):
    """Read an image with OpenCV and return it converted from BGR to RGB.

    Raises:
        FileNotFoundError: when ``cv2.imread`` yields ``None`` for ``path``,
            instead of failing later inside ``cv2.cvtColor``.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


for idx, row in df_output.iterrows():
    object_name = row['object']
    action = row['action']
    filename = row['filename']
    attention_value = row['s_img']
    description = row['description']

    # splitext removes only the final extension, so names containing extra
    # dots keep their full stem.
    stem = os.path.splitext(filename)[0]
    file_name_real = f"{AGD20K_PATH}/Seen/testset/egocentric/{action}/{object_name}/{filename}"
    gt_path = f"{AGD20K_PATH}/Seen/testset/GT/{action}/{object_name}/{stem}.png"

    # The source image is read once and reused for both sizing and display.
    orig_img = _read_rgb(file_name_real)
    h, w = orig_img.shape[:2]

    print(f"[{idx}] Processing: {action} - {object_name}")

    # --- CLIPSeg heatmap for the object name ---
    clip_heatmap = get_clipseg_heatmap(
        file_name_real,
        clip_model,
        processor,
        object_name,
    )
    # Bring the CLIPSeg result onto the attention grid so the two maps can be
    # multiplied element-wise (cv2.resize takes (width, height)).
    clip_heatmap_resized = cv2.resize(
        clip_heatmap, (ATTN_GRID[1], ATTN_GRID[0]), interpolation=cv2.INTER_LINEAR
    )

    # --- Sum the heatmaps of every head in the target layer ---
    sum_heatmap = np.zeros(ATTN_GRID, dtype=np.float32)
    total_heads_count = 0
    for each_attention in attention_value:
        if each_attention['layer'] == TARGET_LAYER:
            sum_heatmap += each_attention['heatmap']
            total_heads_count += 1
    if total_heads_count == 0:
        print(f"WARNING: no attention entries for layer {TARGET_LAYER}; attention map is all zeros")

    # --- Average and min-max normalise ---
    # The epsilon keeps the division defined when no head matched.
    average_heatmap = sum_heatmap / (total_heads_count + 1e-8)
    avg_norm = min_max_normalize(average_heatmap)

    # --- Element-wise product: attention x CLIPSeg heatmap ---
    avg_norm_cliped = avg_norm * clip_heatmap_resized
    # Upsample once to the image resolution; reused for blurring and display.
    avg_norm_cliped_rescaled = cv2.resize(avg_norm_cliped, (w, h), interpolation=cv2.INTER_LINEAR)

    # --- Gaussian blur scaled to the image size ---
    sig = min(w, h) * BLUR_SIGMA_RATIO
    k_val = int(sig * 3) * 2 + 1  # odd kernel width spanning roughly +/- 3 sigma
    blurred = cv2.GaussianBlur(avg_norm_cliped_rescaled, (k_val, k_val), sig)
    # Blurring lowers the peak, so normalise again afterwards.
    avg_norm_cliped_blur = min_max_normalize(blurred)

    # --- Ground-truth metrics ---
    # Done before any figure exists so that skipping a sample cannot leave an
    # unclosed figure behind.
    gt_map = load_ground_truth(gt_path)
    if gt_map is None:
        print("NO GT!!!")
        continue
    metrics_dino = calculate_metrics(avg_norm_cliped_blur, gt_map)
    metrics_tracker_lastinput.update(metrics_dino)
    metrics_tracker_lastinput.print_metrics(metrics_dino, filename)
    metrics_text = f"[{object_name} {action} {filename}]  KLD: {metrics_dino['KLD']:.4f} | SIM: {metrics_dino['SIM']:.4f} | NSS: {metrics_dino['NSS']:.4f}"

    # Figure-level title: object / action / file, the metrics, and the description.
    main_title = f"Last input Token - Object: {object_name}  |  Action: {action}  |  File: {filename} \n {metrics_text} \n {description}"

    img_gt = _read_rgb(gt_path)

    # --- Visualisation: six panels in one wide row ---
    fig, axes = plt.subplots(1, 6, figsize=(22, 6))
    try:
        fig.suptitle(main_title, fontsize=16, fontweight='bold', y=0.95)

        # (1) Original image
        axes[0].imshow(orig_img)
        axes[0].set_title(f"Original\n({object_name})")
        axes[0].axis('off')

        # (3) CLIPSeg heatmap on the attention grid (continuous, not thresholded)
        axes[2].imshow(clip_heatmap_resized, cmap='gray')
        axes[2].set_title("clip_heatmap_resized")
        axes[2].axis('off')

        # (2) averaged attention, (4) attention x CLIPSeg, (5) blurred product;
        # all three share the same colormap and colorbar settings.
        heat_panels = (
            (axes[1], avg_norm, "Avg Attention"),
            (axes[3], avg_norm_cliped_rescaled, "Hadamard\nAttention"),
            (axes[4], avg_norm_cliped_blur, "Blured"),
        )
        for ax, panel_data, panel_title in heat_panels:
            im = ax.imshow(panel_data, cmap='jet', interpolation='bilinear')
            ax.set_title(panel_title)
            ax.axis('off')
            plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

        # (6) Ground truth
        axes[5].imshow(img_gt)
        axes[5].set_title("Ground Truth")
        axes[5].axis('off')

        # --- Save the figure ---
        save_path = os.path.join(output_dir, f"{object_name}_{action}_{stem}.png")
        plt.tight_layout()
        plt.savefig(save_path, bbox_inches='tight', dpi=150)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)


[0] Processing: cut - apple


  return self.preprocess(images, **kwargs)



Metrics for all_output apple_000054.jpg:
 all_output Current - KLD: 0.6917 | SIM: 0.6425 | NSS: 0.7221

Cumulative all_output  Averages over 1 samples:
Average - KLD: 0.6917 | SIM: 0.6425 | NSS: 0.7221

[1] Processing: eat - apple

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.0660 | SIM: 0.8716 | NSS: 1.2721

Cumulative all_output  Averages over 2 samples:
Average - KLD: 0.3788 | SIM: 0.7571 | NSS: 0.9971

[2] Processing: peel - apple

Metrics for all_output apple_001541.jpg:
 all_output Current - KLD: 0.0660 | SIM: 0.8716 | NSS: 1.2721

Cumulative all_output  Averages over 3 samples:
Average - KLD: 0.2746 | SIM: 0.7953 | NSS: 1.0888

[3] Processing: hit - axe

Metrics for all_output axe_000961.jpg:
 all_output Current - KLD: 1.0308 | SIM: 0.3868 | NSS: 0.7628

Cumulative all_output  Averages over 4 samples:
Average - KLD: 0.4636 | SIM: 0.6931 | NSS: 1.0073

[4] Processing: hold - axe

Metrics for all_output axe_001552.jpg:
 all_output Current - KLD: 0.6996 | 

In [None]:
# Last token, all layers/heads  : Average - KLD: 0.9268 | SIM: 0.4970 | NSS: 1.5135
# Layer 26 / head 20 only       : Average - KLD: 1.0616 | SIM: 0.4745 | NSS: 1.3285
# L26/H20 and L24/H31 only      : KLD: 1.0722 | SIM: 0.4759 | NSS: 1.3081
# L26 only                      : KLD: 1.0796 | SIM: 0.4607 | NSS: 1.2327