In [None]:
import torch
import os
import numpy as np
from PIL import Image
from transformers import FlavaModel, FlavaProcessor


# =============== Capture buffers for the text, vision and multimodal branches ===============
# One list per modality and projection; register_hooks() grows each one to hold
# a sub-list per encoder layer, and the forward hooks append to those sub-lists.
text_q, text_k = [], []              # self-attention queries / keys of the text encoder
vision_q, vision_k = [], []          # self-attention queries / keys of the vision encoder
multimodal_q, multimodal_k = [], []  # self-attention queries / keys of the multimodal encoder

# (disabled) per-layer FFN outputs
# text_ffn_outputs = []
# vision_ffn_outputs = []
# multimodal_ffn_outputs = []

# (disabled) embeddings captured before the transformer stack
# image_embedding_before_transformer = None
# text_embedding_before_transformer = None
# multi_embedding_before_transformer = None

# Handles of every registered hook, kept so they can be removed afterwards.
hooks = []

# Hook registration helper
def register_hooks(encoder, modality):
    """Attach forward hooks that record every layer's attention queries and keys.

    For each layer in ``encoder.layer`` a forward hook is registered on the
    ``key`` and ``query`` sub-modules of ``layer.attention.attention``. Whenever
    one of them runs, its output is passed through *that same layer's*
    ``transpose_for_scores``, detached, moved to the CPU, converted to a NumPy
    array and appended to the module-level buffer of ``modality`` at the
    layer's index.

    Args:
        encoder: Object exposing ``layer``, a sequence of transformer layers.
        modality: ``"text"``, ``"vision"`` or ``"multimodal"``; selects which
            pair of module-level buffers (``*_q`` / ``*_k``) receives the data.

    Returns:
        list: The hook handles, two per layer (key first, then query). Call
        ``.remove()`` on each one to detach the hooks again.

    Raises:
        ValueError: If ``modality`` is not one of the supported names.
    """
    if modality == "text":
        target_q, target_k = text_q, text_k
    elif modality == "vision":
        target_q, target_k = vision_q, vision_k
    elif modality == "multimodal":
        target_q, target_k = multimodal_q, multimodal_k
    else:
        raise ValueError(f"Unknown modality: {modality}")

    # Make sure both buffers have one slot per layer (each is padded on its own,
    # so a length mismatch between them cannot cause an IndexError later).
    num_layers = len(encoder.layer)
    for store in (target_q, target_k):
        while len(store) < num_layers:
            store.append([])

    def _make_hook(store, layer_idx, to_heads):
        # A factory pins `layer_idx` and `to_heads` per layer. A closure over the
        # loop variables would see only the values from the final iteration.
        def _hook(module, inputs, output):
            # detach() keeps .numpy() from raising when autograd is enabled.
            store[layer_idx].append(to_heads(output).detach().cpu().numpy())

        return _hook

    encoder_hooks = []
    for layer_idx, layer in enumerate(encoder.layer):
        attn = layer.attention.attention
        to_heads = attn.transpose_for_scores

        encoder_hooks.append(
            attn.key.register_forward_hook(_make_hook(target_k, layer_idx, to_heads))
        )
        encoder_hooks.append(
            attn.query.register_forward_hook(_make_hook(target_q, layer_idx, to_heads))
        )

    return encoder_hooks


def process_and_save(data_list, save_path, file_name):
    """Merge the per-layer captures into one array and write it as ``.npy``.

    ``data_list`` holds one entry per layer; each entry is a list of NumPy
    arrays (one per captured forward call). The arrays of a layer are
    concatenated along axis 0, then the per-layer results are stacked on a new
    leading axis, so the saved array has the layer index on axis 0.
    # NOTE(review): with captures of shape (batch, head, tokens, head_dim) the
    # result is (layer, batch, head, tokens, head_dim) — confirm against the hooks.

    Layers without captures are reported and skipped, which shifts the index of
    every following layer in the saved array.

    Args:
        data_list: List (per layer) of lists of NumPy arrays.
        save_path: Output directory; created if it does not exist.
        file_name: File stem; the file is ``{file_name}_all_layers.npy``.

    Returns:
        None. Nothing is written when there is no data at all.
    """
    # Create the output directory if needed; exist_ok guards against a race
    # with another process creating it between the check and the call.
    if not os.path.isdir(save_path):
        os.makedirs(save_path, exist_ok=True)
        print(f"Directory {save_path} created.")

    if not data_list:
        print(f"No data to save for {file_name}.")
        return

    # Merge the captures of every layer, reporting empty layers by their real index.
    layer_arrays = []
    for layer_idx, layer_data in enumerate(data_list):
        if not layer_data:
            print(f"Layer {layer_idx} has no data.")
            continue
        layer_arrays.append(np.concatenate(layer_data, axis=0))

    if not layer_arrays:
        print(f"No layers to save for {file_name}.")
        return

    # Stack the layers on a new leading (layer) axis and save.
    all_layers_array = np.stack(layer_arrays, axis=0)
    save_file = os.path.join(save_path, f"{file_name}_all_layers.npy")
    np.save(save_file, all_layers_array)
    print(f"Saved {save_file}")
            

if __name__ == "__main__":
    # Extract per-layer attention queries/keys of FLAVA's text, vision and
    # multimodal encoders for one movie's (frame, caption) pairs and save them.
    import re  # local import: only used to natural-sort the frame file names

    # ========== Load the model and the processor ==========
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "./model/facebook/flava-full"
    model = FlavaModel.from_pretrained(model_name)
    processor = FlavaProcessor.from_pretrained(model_name)

    model.to(device)
    model.eval()
    print(f"模型已加载至 {device}")

    # ========== Register the capture hooks ==========
    text_encoder = model.text_model.encoder
    vision_encoder = model.image_model.encoder
    multimodal_encoder = model.multimodal_model.encoder

    hooks += register_hooks(text_encoder, modality="text")
    hooks += register_hooks(vision_encoder, modality="vision")
    hooks += register_hooks(multimodal_encoder, modality="multimodal")

    try:
        # ========== Prepare the inputs ==========
        # Text: one caption per line. A missing file is fatal — continuing with
        # an empty list would only fail later with a less helpful error.
        texts_file_path = './stimulate_hcpmovie/text/movie1_modify.txt'
        try:
            with open(texts_file_path, 'r', encoding='utf-8') as file:
                texts = [line.strip() for line in file]
        except FileNotFoundError:
            print(f"File {texts_file_path} not found.")
            raise
        print(f"Loaded {len(texts)} text lines.")

        # Images: os.listdir() returns names in arbitrary order, so sort them
        # naturally (frame2 before frame10) to keep frames aligned with captions.
        # NOTE(review): assumes the sorted file order matches the caption order.
        def _natural_key(name):
            parts = re.split(r"(\d+)", name)
            # Odd positions are the digit runs captured by the split pattern.
            return [int(p) if i % 2 else p.lower() for i, p in enumerate(parts)]

        image_folder = './stimulate_hcpmovie/image/movie1'
        images_list = sorted(
            (name for name in os.listdir(image_folder)
             if os.path.isfile(os.path.join(image_folder, name))),
            key=_natural_key,
        )
        images = []
        for image_name in images_list:
            # Image.open is lazy and keeps the file open; convert() forces the
            # pixel data to load so the handle can be closed right away.
            with Image.open(os.path.join(image_folder, image_name)) as img:
                images.append(img.convert("RGB"))
        print(f"Loaded {len(images)} images.")

        if len(texts) != len(images):
            raise ValueError(
                f"Got {len(texts)} text lines but {len(images)} images; "
                "the multimodal encoder needs one caption per frame."
            )

        inputs = processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # ========== Forward pass (the hooks capture the data) ==========
        # The attention mask must be forwarded: the captions are padded to a
        # common length and the model would otherwise attend to the pad tokens.
        # The model output itself is not needed, so it is not kept alive.
        with torch.no_grad():
            model(
                pixel_values=inputs.pixel_values,
                input_ids=inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
            )
    finally:
        # Always detach the hooks, even if loading or the forward pass failed.
        for hook in hooks:
            hook.remove()
        hooks.clear()

    del model
    torch.cuda.empty_cache()

    # ========== Process and save the captured data ==========
    process_and_save(text_q, './q', 'text_q')
    process_and_save(text_k, './k', 'text_k')
    process_and_save(vision_q, './q', 'vision_q')
    process_and_save(vision_k, './k', 'vision_k')
    process_and_save(multimodal_q, './q', 'multimodal_q')
    process_and_save(multimodal_k, './k', 'multimodal_k')

    # ========== Release the capture buffers ==========
    del text_q, text_k, vision_q, vision_k, multimodal_q, multimodal_k
    print("Processing complete and memory cleared.")

In [None]:
import torch
import os
import numpy as np
from PIL import Image
from transformers import FlavaModel, FlavaProcessor


# =============== Capture buffers for the text, vision and multimodal branches ===============
# One list per modality and projection; register_hooks() grows each one to hold
# a sub-list per encoder layer, and the forward hooks append to those sub-lists.
text_q, text_k = [], []              # self-attention queries / keys of the text encoder
vision_q, vision_k = [], []          # self-attention queries / keys of the vision encoder
multimodal_q, multimodal_k = [], []  # self-attention queries / keys of the multimodal encoder

# (disabled) per-layer FFN outputs
# text_ffn_outputs = []
# vision_ffn_outputs = []
# multimodal_ffn_outputs = []

# (disabled) embeddings captured before the transformer stack
# image_embedding_before_transformer = None
# text_embedding_before_transformer = None
# multi_embedding_before_transformer = None

# Handles of every registered hook, kept so they can be removed afterwards.
hooks = []

# Hook registration helper
def register_hooks(encoder, modality):
    """Attach forward hooks that record every layer's attention queries and keys.

    For each layer in ``encoder.layer`` a forward hook is registered on the
    ``key`` and ``query`` sub-modules of ``layer.attention.attention``. Whenever
    one of them runs, its output is passed through *that same layer's*
    ``transpose_for_scores``, detached, moved to the CPU, converted to a NumPy
    array and appended to the module-level buffer of ``modality`` at the
    layer's index.

    Args:
        encoder: Object exposing ``layer``, a sequence of transformer layers.
        modality: ``"text"``, ``"vision"`` or ``"multimodal"``; selects which
            pair of module-level buffers (``*_q`` / ``*_k``) receives the data.

    Returns:
        list: The hook handles, two per layer (key first, then query). Call
        ``.remove()`` on each one to detach the hooks again.

    Raises:
        ValueError: If ``modality`` is not one of the supported names.
    """
    if modality == "text":
        target_q, target_k = text_q, text_k
    elif modality == "vision":
        target_q, target_k = vision_q, vision_k
    elif modality == "multimodal":
        target_q, target_k = multimodal_q, multimodal_k
    else:
        raise ValueError(f"Unknown modality: {modality}")

    # Make sure both buffers have one slot per layer (each is padded on its own,
    # so a length mismatch between them cannot cause an IndexError later).
    num_layers = len(encoder.layer)
    for store in (target_q, target_k):
        while len(store) < num_layers:
            store.append([])

    def _make_hook(store, layer_idx, to_heads):
        # A factory pins `layer_idx` and `to_heads` per layer. A closure over the
        # loop variables would see only the values from the final iteration.
        def _hook(module, inputs, output):
            # detach() keeps .numpy() from raising when autograd is enabled.
            store[layer_idx].append(to_heads(output).detach().cpu().numpy())

        return _hook

    encoder_hooks = []
    for layer_idx, layer in enumerate(encoder.layer):
        attn = layer.attention.attention
        to_heads = attn.transpose_for_scores

        encoder_hooks.append(
            attn.key.register_forward_hook(_make_hook(target_k, layer_idx, to_heads))
        )
        encoder_hooks.append(
            attn.query.register_forward_hook(_make_hook(target_q, layer_idx, to_heads))
        )

    return encoder_hooks


def process_and_save(data_list, save_path, file_name):
    """Merge the per-layer captures into one sample-major array and save it.

    ``data_list`` holds one entry per layer; each entry is a list of NumPy
    arrays (one per captured forward call). The arrays of a layer are
    concatenated along axis 0, the per-layer results are stacked on a new
    leading axis, and the first two axes are then swapped, so the saved 5-D
    array has the concatenated axis first and the layer index second.
    # NOTE(review): with captures of shape (batch, head, tokens, head_dim) the
    # result is (sample, layer, head, tokens, head_dim) — confirm against the hooks.

    Layers without captures are reported and skipped, which shifts the index of
    every following layer in the saved array.

    Args:
        data_list: List (per layer) of lists of 4-D NumPy arrays.
        save_path: Output directory; created if it does not exist.
        file_name: File stem; the file is ``{file_name}_all_layers.npy``.

    Returns:
        None. Nothing is written when there is no data at all.
    """
    # Create the output directory if needed; exist_ok guards against a race
    # with another process creating it between the check and the call.
    if not os.path.isdir(save_path):
        os.makedirs(save_path, exist_ok=True)
        print(f"Directory {save_path} created.")

    if not data_list:
        print(f"No data to save for {file_name}.")
        return

    # Merge the captures of every layer, reporting empty layers by their real index.
    layer_arrays = []
    for layer_idx, layer_data in enumerate(data_list):
        if not layer_data:
            print(f"Layer {layer_idx} has no data.")
            continue
        layer_arrays.append(np.concatenate(layer_data, axis=0))

    if not layer_arrays:
        print(f"No layers to save for {file_name}.")
        return

    # Stack on a new leading (layer) axis, then swap the first two axes so the
    # concatenated axis comes first.
    all_layers_array = np.stack(layer_arrays, axis=0).transpose((1, 0, 2, 3, 4))
    save_file = os.path.join(save_path, f"{file_name}_all_layers.npy")
    np.save(save_file, all_layers_array)
    print(f"Saved {save_file}")
            

if __name__ == "__main__":
    # Extract per-layer attention queries/keys of FLAVA's text, vision and
    # multimodal encoders for one movie's (frame, caption) pairs and save them
    # under {output_dir}/{movie}/.
    import re  # local import: only used to natural-sort the frame file names

    output_dir = 'result_new'
    movie = 'movie2'

    # ========== Load the model and the processor ==========
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = "cpu"
    model_name = "./model/facebook/flava-full"
    model = FlavaModel.from_pretrained(model_name)
    processor = FlavaProcessor.from_pretrained(model_name)

    model.to(device)
    model.eval()
    print(f"模型已加载至 {device}")

    # ========== Register the capture hooks ==========
    text_encoder = model.text_model.encoder
    vision_encoder = model.image_model.encoder
    multimodal_encoder = model.multimodal_model.encoder

    hooks += register_hooks(text_encoder, modality="text")
    hooks += register_hooks(vision_encoder, modality="vision")
    hooks += register_hooks(multimodal_encoder, modality="multimodal")

    try:
        # ========== Prepare the inputs ==========
        # The stimulus paths follow `movie`, so the inputs always match the
        # directory the results are written to (they used to be fixed to movie1
        # while the output went to movie2).
        # NOTE(review): assumes every movie uses the "<movie>_modify.txt" naming.
        texts_file_path = f'./stimulate_hcpmovie/text/{movie}_modify.txt'
        try:
            with open(texts_file_path, 'r', encoding='utf-8') as file:
                texts = [line.strip() for line in file]
        except FileNotFoundError:
            # A missing file is fatal — continuing with an empty list would
            # only fail later with a less helpful error.
            print(f"File {texts_file_path} not found.")
            raise
        print(f"Loaded {len(texts)} text lines.")

        # Images: os.listdir() returns names in arbitrary order, so sort them
        # naturally (frame2 before frame10) to keep frames aligned with captions.
        # NOTE(review): assumes the sorted file order matches the caption order.
        def _natural_key(name):
            parts = re.split(r"(\d+)", name)
            # Odd positions are the digit runs captured by the split pattern.
            return [int(p) if i % 2 else p.lower() for i, p in enumerate(parts)]

        image_folder = f'./stimulate_hcpmovie/image/{movie}'
        images_list = sorted(
            (name for name in os.listdir(image_folder)
             if os.path.isfile(os.path.join(image_folder, name))),
            key=_natural_key,
        )
        images = []
        for image_name in images_list:
            # Image.open is lazy and keeps the file open; convert() forces the
            # pixel data to load so the handle can be closed right away.
            with Image.open(os.path.join(image_folder, image_name)) as img:
                images.append(img.convert("RGB"))
        print(f"Loaded {len(images)} images.")

        if len(texts) != len(images):
            raise ValueError(
                f"Got {len(texts)} text lines but {len(images)} images; "
                "the multimodal encoder needs one caption per frame."
            )

        inputs = processor(
            text=texts,
            images=images,
            padding=True,
            return_tensors="pt"
        ).to(device)

        # ========== Forward pass (the hooks capture the data) ==========
        # The attention mask must be forwarded: the captions are padded to a
        # common length and the model would otherwise attend to the pad tokens.
        # The model output itself is not needed, so it is not kept alive.
        with torch.no_grad():
            model(
                pixel_values=inputs.pixel_values,
                input_ids=inputs.input_ids,
                attention_mask=inputs.get("attention_mask"),
            )
    finally:
        # Always detach the hooks, even if loading or the forward pass failed.
        for hook in hooks:
            hook.remove()
        hooks.clear()

    del model
    torch.cuda.empty_cache()

    # ========== Process and save the captured data ==========
    process_and_save(text_q, f'{output_dir}/{movie}/q', 'text_q')
    process_and_save(text_k, f'{output_dir}/{movie}/k', 'text_k')
    process_and_save(vision_q, f'{output_dir}/{movie}/q', 'vision_q')
    process_and_save(vision_k, f'{output_dir}/{movie}/k', 'vision_k')
    process_and_save(multimodal_q, f'{output_dir}/{movie}/q', 'multimodal_q')
    process_and_save(multimodal_k, f'{output_dir}/{movie}/k', 'multimodal_k')

    # ========== Release the capture buffers ==========
    del text_q, text_k, vision_q, vision_k, multimodal_q, multimodal_k
    print("Processing complete and memory cleared.")

In [None]:
import numpy as np
import os

movie = "movie2"
modality = 'multimodal'

# Load the saved queries and keys.
# Expected layout per the files written upstream: (sample, layer, head, sequence, head_dim).
Q = np.load(f'result_new/{movie}/q/{modality}_q_all_layers.npy')
K = np.load(f'result_new/{movie}/k/{modality}_k_all_layers.npy')

# Every axis except the sequence axis (3) has to agree between Q and K.
if Q.ndim != 5 or K.ndim != 5 or Q.shape[:3] + Q.shape[4:] != K.shape[:3] + K.shape[4:]:
    raise ValueError(f"Incompatible Q/K shapes: {Q.shape} vs {K.shape}")

sample_num, layer_num, head_num, _, head_dim = Q.shape

# One "artificial neuron" per (layer, head, j) is the outer product of column j
# of Q with column j of K, and its activation is the mean of that matrix.
# The mean of an outer product factorises:
#     mean_{a,b}(q_a * k_b) = mean(q) * mean(k)
# so the sequence x sequence matrices never have to be built; averaging Q and K
# over the sequence axis gives the same activations (up to float rounding) in a
# single vectorised step instead of sample*layer*head*head_dim Python iterations.
activations = Q.mean(axis=3) * K.mean(axis=3)  # (sample, layer, head, head_dim)

# Same intermediate layout as before: one (head, head_dim) block per (sample, layer).
sample_neuron_activations_np = activations.reshape(sample_num * layer_num, head_num, head_dim)
# One flat feature vector per sample, ordered layer -> head -> head_dim.
sample_neuron_activations_np_reshape = activations.reshape(sample_num, layer_num * head_num * head_dim)

save_dir = f"result_new/{movie}/qk"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir, exist_ok=True)
    print(f"Directory {save_dir} create")

save_path = os.path.join(save_dir, f'{modality}_qk.npy')

np.save(save_path, sample_neuron_activations_np_reshape)
print("Done!")