In [None]:
from cli_visualization import HuatuoChatbot
import torch
from PIL import Image
import json
from tqdm import tqdm
from transformers import TextStreamer
import seaborn as sns
import matplotlib.pyplot as plt
import os
from collections import defaultdict
import numpy as np
import random
import cv2

# Geometry of the visual token grid used by the attention metrics below.
# PATCHES is the side length of the square grid; the metric helpers flatten a
# grid coordinate as `row * PATCHES + col`, so the flattened map has
# PATCHES * PATCHES entries.
PATCHES = 24
NUM_IMG_TOKENS = PATCHES * PATCHES
# NOTE(review): SIZE is not referenced by any visible cell; presumably the
# (width, height) the image is resized to before encoding — confirm.
SIZE = (336, 336)

# Shared chatbot instance; generate_attention_maps() calls
# bot.inference_with_attention_output() on it for every prompt.
# NOTE(review): presumably this loads the HuatuoGPT-Vision-7B checkpoint by its
# hub id (slow, may download weights) — confirm against cli_visualization.
bot = HuatuoChatbot("FreedomIntelligence/HuatuoGPT-Vision-7B")

In [None]:
# Token id searched for in `input_ids` to locate where the image is inserted.
# NOTE(review): presumably the LLaVA-style image placeholder id — confirm
# against the tokenizer/constants used by cli_visualization.
_IMAGE_PLACEHOLDER_ID = -200


def _image_token_attention(prompt, image_path, layers):
    """Run one inference pass and return last-position attention over the image span.

    Args:
        prompt: Full text prompt passed to the chatbot.
        image_path: Path of the image passed to the chatbot.
        layers: Iterable of layer indices into ``model_output['attentions']``.

    Returns:
        np.ndarray with one row per requested layer; each row is the
        head-averaged attention (float32) from the last query position to the
        ``NUM_IMG_TOKENS`` key positions starting at the image placeholder.

    Raises:
        ValueError: If the prompt does not contain exactly one image placeholder.
    """
    model_output, input_ids = bot.inference_with_attention_output(prompt, image_path)
    token_ids = input_ids[0].cpu()

    placeholder_positions = torch.where(token_ids == _IMAGE_PLACEHOLDER_ID)[0]
    if placeholder_positions.numel() != 1:
        raise ValueError(
            f"Expected exactly one image placeholder token in the prompt, "
            f"found {placeholder_positions.numel()}."
        )

    # Use plain Python ints as slice bounds instead of a 1-element tensor.
    start = int(placeholder_positions[0])
    stop = start + NUM_IMG_TOKENS

    # NOTE(review): assumes each attention tensor is indexed as
    # (batch, heads, query, key) with the image tokens expanded in place of the
    # placeholder — confirm against inference_with_attention_output.
    per_layer = [
        model_output['attentions'][layer][0, :, -1, start:stop]
        .mean(dim=0)
        .to(torch.float32)
        .detach()
        .cpu()
        .numpy()
        for layer in layers
    ]
    return np.array(per_layer)


def generate_attention_maps(question, image_path, layers=range(28)):
    """Compute image-attention maps for a question and for a generic description prompt.

    Two inference passes are run on the same image: one with ``question`` and
    one with a fixed general-description request. Callers in this notebook
    divide the first result by the second to normalize the question-specific
    attention.

    Args:
        question: Question text about the image.
        image_path: Path of the image to attend over.
        layers: Iterable of layer indices to extract (defaults to 0..27).

    Returns:
        Tuple ``(att_maps, general_att_maps)``; each is an np.ndarray with one
        row per layer and ``NUM_IMG_TOKENS`` columns.

    Raises:
        ValueError: If either prompt does not contain exactly one image placeholder.
    """
    general_question = 'Write a general description of the image.'
    prompt = f"{question} Answer the question using a single word or phrase."
    general_prompt = f"{general_question} Answer the question using a single word or phrase."

    att_maps = _image_token_attention(prompt, image_path, layers)
    general_att_maps = _image_token_attention(general_prompt, image_path, layers)

    return att_maps, general_att_maps


def attention_ratio_vectorized(visual_attentions, gt_tokens, patches=None):
    """Ratio of attention on the ground-truth patches to the amount expected on average.

    For every leading index (e.g. every layer) the attention summed over the
    ground-truth patches is divided by ``mean attention per patch * number of
    ground-truth patches``. A value above 1 therefore means the ground-truth
    region receives more attention than an equally sized average region.

    Args:
        visual_attentions: Array whose last axis runs over the flattened image
            patches; any leading axes are preserved in the result.
        gt_tokens: Non-empty sequence of patch coordinates. Each entry is
            flattened as ``token[1] * patches + token[0]``.
        patches: Side length of the patch grid used for flattening. Defaults to
            the module-level ``PATCHES``.

    Returns:
        Array of ratios with the shape of ``visual_attentions`` minus its last axis.

    Raises:
        ValueError: If ``gt_tokens`` is empty.
    """
    if len(gt_tokens) == 0:
        raise ValueError("gt_tokens must contain at least one patch coordinate")

    grid = PATCHES if patches is None else patches
    visual_attentions = np.asarray(visual_attentions)

    # Explicit integer dtype keeps the array usable as an index.
    token_indices = np.array(
        [token[1] * grid + token[0] for token in gt_tokens], dtype=np.intp
    )

    # Use the map's own length so the average stays correct for any grid size.
    num_tokens = visual_attentions.shape[-1]

    relevant_attention = visual_attentions[..., token_indices].sum(axis=-1)
    average_attention = visual_attentions.sum(axis=-1) / num_tokens * len(token_indices)

    # Small constant guards against division by zero for an all-zero map.
    return relevant_attention / (average_attention + 1e-8)


def js_divergence_vectorized(att_map, gt_tokens, epsilon=1e-8, patches=None):
    """Jensen-Shannon divergence between attention and a uniform ground-truth mask.

    Each row along the last axis of ``att_map`` is normalized to sum to
    (approximately) one and compared with a distribution that is uniform over
    the ground-truth patches and zero elsewhere.

    Args:
        att_map: Array whose last axis runs over the flattened image patches;
            leading axes (e.g. layers) are preserved in the result.
        gt_tokens: Non-empty sequence of patch coordinates. Each entry is
            flattened as ``token[1] * patches + token[0]``.
        epsilon: Small constant added to the normalizer and inside the
            logarithms to avoid division by zero and ``log(0)``.
        patches: Side length of the patch grid used for flattening. Defaults to
            the module-level ``PATCHES``.

    Returns:
        Array of divergences (natural log) with the shape of ``att_map`` minus
        its last axis.

    Raises:
        ValueError: If ``gt_tokens`` is empty (the mask could not be normalized
            and the result would silently be NaN).
    """
    if len(gt_tokens) == 0:
        raise ValueError("gt_tokens must contain at least one patch coordinate")

    grid = PATCHES if patches is None else patches
    att_map = np.asarray(att_map)

    # Per-row normalization: every leading index gets its own distribution.
    att_dist = att_map / (att_map.sum(axis=-1, keepdims=True) + epsilon)

    # Uniform distribution over the ground-truth patches.
    gt_indices = np.array(
        [token[1] * grid + token[0] for token in gt_tokens], dtype=np.intp
    )
    gt_mask = np.zeros(att_map.shape[-1])
    gt_mask[gt_indices] = 1
    gt_mask = gt_mask / gt_mask.sum()

    # JS divergence is the mean of both KL terms against the mixture.
    mixture = 0.5 * (att_dist + gt_mask)
    kl_att_m = np.sum(att_dist * np.log((att_dist + epsilon) / (mixture + epsilon)), axis=-1)
    kl_gt_m = np.sum(gt_mask * np.log((gt_mask + epsilon) / (mixture + epsilon)), axis=-1)

    return 0.5 * (kl_att_m + kl_gt_m)


def kl_divergence_vectorized(att_map, gt_tokens, patches=None):
    """KL divergence from a uniform ground-truth mask to the attention distribution.

    Computes ``KL(gt || att)`` independently for every row along the last axis
    of ``att_map``. The ground-truth distribution is uniform over the given
    patches and zero elsewhere.

    Args:
        att_map: Array whose last axis runs over the flattened image patches;
            leading axes (e.g. layers) are preserved in the result.
        gt_tokens: Non-empty sequence of patch coordinates. Each entry is
            flattened as ``token[1] * patches + token[0]``.
        patches: Side length of the patch grid used for flattening. Defaults to
            the module-level ``PATCHES``.

    Returns:
        Array of divergences (natural log) with the shape of ``att_map`` minus
        its last axis.

    Raises:
        ValueError: If ``gt_tokens`` is empty.
    """
    if len(gt_tokens) == 0:
        raise ValueError("gt_tokens must contain at least one patch coordinate")

    grid = PATCHES if patches is None else patches
    att_map = np.asarray(att_map)

    # Normalize each row separately. Dividing by the sum over the whole array
    # would leave every layer's row summing to less than one whenever several
    # layers are stacked, adding a layer-dependent offset to the divergence.
    att_dist = att_map / att_map.sum(axis=-1, keepdims=True)

    gt_indices = np.array(
        [token[1] * grid + token[0] for token in gt_tokens], dtype=np.intp
    )
    gt_mask = np.zeros(att_map.shape[-1])
    gt_mask[gt_indices] = 1
    gt_mask = gt_mask / gt_mask.sum()

    # Clip both distributions away from zero so the log ratio stays finite.
    epsilon = 1e-12
    att_dist = np.clip(att_dist, epsilon, 1)
    gt_mask = np.clip(gt_mask, epsilon, 1)

    return np.sum(gt_mask * np.log(gt_mask / att_dist), axis=-1)

### SLAKE Localization

In [None]:
# Evaluate the VGMED localization questions: for every sample, compare the
# question-specific attention (normalized by the general-description attention)
# with the ground-truth patches, then cache the per-sample metrics on disk.
input_path = "./VGMED_localization_questions.jsonl"
with open(input_path, "r") as infile:
    # Skip blank lines so stray empty rows in the JSONL file do not break parsing.
    questions = [json.loads(line) for line in infile if line.strip()]

# Create the output folder before the long inference loop so the saves at the
# end cannot fail on a missing directory.
os.makedirs("attention_measurement", exist_ok=True)

attention_ratios_normalized = []
attention_kl_normalized = []
attention_js_normalized = []

for sample in tqdm(questions):
    image_path = os.path.join("../LLaVA/playground/data/Slake1.0/imgs", sample["image"], "source.jpg")
    qs = sample["question"]
    att_maps, general_att_maps = generate_attention_maps(qs, image_path)
    gt_tokens = sample["gt_tokens"]

    # Normalize once and reuse for all three metrics.
    # NOTE(review): a zero in general_att_maps yields inf/NaN here — confirm
    # whether that can occur for this model.
    normalized_att_maps = att_maps / general_att_maps

    att_ratio_normalized = attention_ratio_vectorized(normalized_att_maps, gt_tokens)
    att_kl_normalized = kl_divergence_vectorized(normalized_att_maps, gt_tokens)
    att_js_normalized = js_divergence_vectorized(normalized_att_maps, gt_tokens)

    attention_ratios_normalized.append(att_ratio_normalized)
    attention_kl_normalized.append(att_kl_normalized)
    attention_js_normalized.append(att_js_normalized)


torch.save(attention_ratios_normalized, "attention_measurement/VGMED_LOCAL_attention_ratios_normalized.pt")
torch.save(attention_kl_normalized, "attention_measurement/VGMED_LOCAL_attention_kl_normalized.pt")
torch.save(attention_js_normalized, "attention_measurement/VGMED_LOCAL_attention_js_normalized.pt")

In [None]:
# Reload the cached VGMED localization metrics so the plots can be redrawn
# without re-running inference. The files hold pickled lists of NumPy arrays,
# which torch.load rejects under its weights_only=True default (PyTorch >= 2.6),
# so full unpickling is requested explicitly.
# Only load files written by this notebook: unpickling untrusted data can run code.
attention_ratios_normalized = torch.load("attention_measurement/VGMED_LOCAL_attention_ratios_normalized.pt", weights_only=False)
attention_kl_normalized = torch.load("attention_measurement/VGMED_LOCAL_attention_kl_normalized.pt", weights_only=False)
attention_js_normalized = torch.load("attention_measurement/VGMED_LOCAL_attention_js_normalized.pt", weights_only=False)

##### Ratio

In [None]:
# Per-layer attention ratio: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_ratios_normalized`.
layer_ratios = np.mean(np.array(attention_ratios_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention Ratio by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention Ratio", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### KL

In [None]:
# Per-layer KL divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_kl_normalized`.
layer_ratios = np.mean(np.array(attention_kl_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention KL Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention KL Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### JS

In [None]:
# Per-layer JS divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_js_normalized`.
layer_ratios = np.mean(np.array(attention_js_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention JS Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention JS Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

### SLAKE Attribute

In [None]:
# Evaluate the VGMED attribute questions: for every sample, compare the
# question-specific attention (normalized by the general-description attention)
# with the ground-truth patches, then cache the per-sample metrics on disk.
input_path = "./VGMED_attribute_questions.jsonl"
with open(input_path, "r") as infile:
    # Skip blank lines so stray empty rows in the JSONL file do not break parsing.
    questions = [json.loads(line) for line in infile if line.strip()]

# Create the output folder before the long inference loop so the saves at the
# end cannot fail on a missing directory.
os.makedirs("attention_measurement", exist_ok=True)

attention_ratios_normalized = []
attention_kl_normalized = []
attention_js_normalized = []

for sample in tqdm(questions):
    image_path = os.path.join("../LLaVA/playground/data/Slake1.0/imgs", sample["image"], "source.jpg")
    qs = sample["question"]
    att_maps, general_att_maps = generate_attention_maps(qs, image_path)
    gt_tokens = sample["gt_tokens"]

    # Normalize once and reuse for all three metrics.
    # NOTE(review): a zero in general_att_maps yields inf/NaN here — confirm
    # whether that can occur for this model.
    normalized_att_maps = att_maps / general_att_maps

    att_ratio_normalized = attention_ratio_vectorized(normalized_att_maps, gt_tokens)
    att_kl_normalized = kl_divergence_vectorized(normalized_att_maps, gt_tokens)
    att_js_normalized = js_divergence_vectorized(normalized_att_maps, gt_tokens)

    attention_ratios_normalized.append(att_ratio_normalized)
    attention_kl_normalized.append(att_kl_normalized)
    attention_js_normalized.append(att_js_normalized)

torch.save(attention_ratios_normalized, "attention_measurement/VGMED_ATTR_attention_ratios_normalized.pt")
torch.save(attention_kl_normalized, "attention_measurement/VGMED_ATTR_attention_kl_normalized.pt")
torch.save(attention_js_normalized, "attention_measurement/VGMED_ATTR_attention_js_normalized.pt")

In [None]:
# Reload the cached VGMED attribute metrics so the plots can be redrawn
# without re-running inference. The files hold pickled lists of NumPy arrays,
# which torch.load rejects under its weights_only=True default (PyTorch >= 2.6),
# so full unpickling is requested explicitly.
# Only load files written by this notebook: unpickling untrusted data can run code.
attention_ratios_normalized = torch.load("attention_measurement/VGMED_ATTR_attention_ratios_normalized.pt", weights_only=False)
attention_kl_normalized = torch.load("attention_measurement/VGMED_ATTR_attention_kl_normalized.pt", weights_only=False)
attention_js_normalized = torch.load("attention_measurement/VGMED_ATTR_attention_js_normalized.pt", weights_only=False)

##### Ratio

In [None]:
# Per-layer attention ratio: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_ratios_normalized`.
layer_ratios = np.mean(np.array(attention_ratios_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention Ratio by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention Ratio", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### KL

In [None]:
# Per-layer KL divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_kl_normalized`.
layer_ratios = np.mean(np.array(attention_kl_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention KL Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention KL Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### JS

In [None]:
# Per-layer JS divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_js_normalized`.
layer_ratios = np.mean(np.array(attention_js_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention JS Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention JS Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

### COCO Localization

In [None]:
# Evaluate the COCO localization questions: for every sample, compare the
# question-specific attention (normalized by the general-description attention)
# with the ground-truth patches, then cache the per-sample metrics on disk.
input_path = "./COCO_localization_questions_previous.jsonl"
with open(input_path, "r") as infile:
    # Skip blank lines so stray empty rows in the JSONL file do not break parsing.
    questions = [json.loads(line) for line in infile if line.strip()]

# Create the output folder before the long inference loop so the saves at the
# end cannot fail on a missing directory.
os.makedirs("attention_measurement", exist_ok=True)

attention_ratios_normalized = []
attention_kl_normalized = []
attention_js_normalized = []

for sample in tqdm(questions):
    # sample["image"] is formatted as a zero-padded 12-digit integer id.
    image_path = os.path.join('val2014', f'COCO_val2014_{sample["image"]:012d}.jpg')
    qs = sample["question"]
    att_maps, general_att_maps = generate_attention_maps(qs, image_path)
    gt_tokens = sample["gt_tokens"]

    # Normalize once and reuse for all three metrics.
    # NOTE(review): a zero in general_att_maps yields inf/NaN here — confirm
    # whether that can occur for this model.
    normalized_att_maps = att_maps / general_att_maps

    att_ratio_normalized = attention_ratio_vectorized(normalized_att_maps, gt_tokens)
    att_kl_normalized = kl_divergence_vectorized(normalized_att_maps, gt_tokens)
    att_js_normalized = js_divergence_vectorized(normalized_att_maps, gt_tokens)

    attention_ratios_normalized.append(att_ratio_normalized)
    attention_kl_normalized.append(att_kl_normalized)
    attention_js_normalized.append(att_js_normalized)

torch.save(attention_ratios_normalized, "attention_measurement/COCO_LOCAL_attention_ratios_normalized.pt")
torch.save(attention_kl_normalized, "attention_measurement/COCO_LOCAL_attention_kl_normalized.pt")
torch.save(attention_js_normalized, "attention_measurement/COCO_LOCAL_attention_js_normalized.pt")

In [None]:
# Reload the cached COCO localization metrics so the plots can be redrawn
# without re-running inference. The files hold pickled lists of NumPy arrays,
# which torch.load rejects under its weights_only=True default (PyTorch >= 2.6),
# so full unpickling is requested explicitly.
# Only load files written by this notebook: unpickling untrusted data can run code.
attention_ratios_normalized = torch.load("attention_measurement/COCO_LOCAL_attention_ratios_normalized.pt", weights_only=False)
attention_kl_normalized = torch.load("attention_measurement/COCO_LOCAL_attention_kl_normalized.pt", weights_only=False)
attention_js_normalized = torch.load("attention_measurement/COCO_LOCAL_attention_js_normalized.pt", weights_only=False)

##### Ratio

In [None]:
# Per-layer attention ratio: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_ratios_normalized`.
layer_ratios = np.mean(np.array(attention_ratios_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention Ratio by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention Ratio", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### KL

In [None]:
# Per-layer KL divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_kl_normalized`.
layer_ratios = np.mean(np.array(attention_kl_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention KL Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention KL Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### JS

In [None]:
# Per-layer JS divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_js_normalized`.
layer_ratios = np.mean(np.array(attention_js_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention JS Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention JS Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

### COCO Attribute

In [None]:
# Evaluate the COCO attribute questions: for every sample, compare the
# question-specific attention (normalized by the general-description attention)
# with the ground-truth patches, then cache the per-sample metrics on disk.
input_path = "./COCO_attribute_questions.jsonl"
with open(input_path, "r") as infile:
    # Skip blank lines so stray empty rows in the JSONL file do not break parsing.
    questions = [json.loads(line) for line in infile if line.strip()]

# Create the output folder before the long inference loop so the saves at the
# end cannot fail on a missing directory.
os.makedirs("attention_measurement", exist_ok=True)

attention_ratios_normalized = []
attention_kl_normalized = []
attention_js_normalized = []

for sample in tqdm(questions):
    # sample["image"] is formatted as a zero-padded 12-digit integer id.
    image_path = os.path.join('val2014', f'COCO_val2014_{sample["image"]:012d}.jpg')
    qs = sample["question"]
    att_maps, general_att_maps = generate_attention_maps(qs, image_path)
    gt_tokens = sample["gt_tokens"]

    # Normalize once and reuse for all three metrics.
    # NOTE(review): a zero in general_att_maps yields inf/NaN here — confirm
    # whether that can occur for this model.
    normalized_att_maps = att_maps / general_att_maps

    att_ratio_normalized = attention_ratio_vectorized(normalized_att_maps, gt_tokens)
    att_kl_normalized = kl_divergence_vectorized(normalized_att_maps, gt_tokens)
    att_js_normalized = js_divergence_vectorized(normalized_att_maps, gt_tokens)

    attention_ratios_normalized.append(att_ratio_normalized)
    attention_kl_normalized.append(att_kl_normalized)
    attention_js_normalized.append(att_js_normalized)

torch.save(attention_ratios_normalized, "attention_measurement/COCO_ATTR_attention_ratios_normalized.pt")
torch.save(attention_kl_normalized, "attention_measurement/COCO_ATTR_attention_kl_normalized.pt")
torch.save(attention_js_normalized, "attention_measurement/COCO_ATTR_attention_js_normalized.pt")

In [None]:
# Reload the cached COCO attribute metrics so the plots can be redrawn
# without re-running inference. The files hold pickled lists of NumPy arrays,
# which torch.load rejects under its weights_only=True default (PyTorch >= 2.6),
# so full unpickling is requested explicitly.
# Only load files written by this notebook: unpickling untrusted data can run code.
attention_ratios_normalized = torch.load("attention_measurement/COCO_ATTR_attention_ratios_normalized.pt", weights_only=False)
attention_kl_normalized = torch.load("attention_measurement/COCO_ATTR_attention_kl_normalized.pt", weights_only=False)
attention_js_normalized = torch.load("attention_measurement/COCO_ATTR_attention_js_normalized.pt", weights_only=False)

##### Ratio

In [None]:
# Per-layer attention ratio: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_ratios_normalized`.
layer_ratios = np.mean(np.array(attention_ratios_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention Ratio by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention Ratio", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### KL

In [None]:
# Per-layer KL divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_kl_normalized`.
layer_ratios = np.mean(np.array(attention_kl_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention KL Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention KL Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()

##### JS

In [None]:
# Per-layer JS divergence: average over axis 0 (one entry per sample) of the
# metrics currently held in `attention_js_normalized`.
layer_ratios = np.mean(np.array(attention_js_normalized), axis=0)
layer_ids = range(28)  # matches the default `layers` of generate_attention_maps

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(layer_ids, layer_ratios, marker='o', linewidth=2)

# Write each value just above its marker.
for layer_id, layer_value in zip(layer_ids, layer_ratios):
    ax.text(layer_id, layer_value + 0.001, f"{layer_value:.3f}", ha='center', va='bottom', fontsize=9)

# Titles, axis labels and grid.
ax.set_title("Attention JS Divergence by Layer", fontsize=14)
ax.set_xlabel("Model Layers", fontsize=12)
ax.set_ylabel("Average Attention JS Divergence", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
ax.set_xticks(layer_ids)

fig.tight_layout()
plt.show()