# 测试hot参数量

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm  # 进度条

# 配置
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
num_samples = 100  # 批量样本数量，可调整
max_length = 128   # 最大序列长度，控制内存
dataset_name = "wikitext"  # 数据集名称，可换成其他如"gsm8k"
split = "test"  # 数据集拆分

# 加载模型和tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model.eval()  # 设置为评估模式
    print("模型加载成功！")
except Exception as e:
    print(f"加载模型失败：{e}")
    exit(1)

# 加载数据集（取前num_samples个样本）
dataset = load_dataset(dataset_name, "wikitext-2-raw-v1", split=split)  # 示例数据集
# 修改：直接访问'text'列的切片，返回字符串列表
text_list = dataset['text'][:num_samples]  # 获取前num_samples个文本字符串
samples = [text[:max_length] for text in text_list if text.strip()]  # 截断、过滤空行
if len(samples) < num_samples:
    print(f"警告: 数据集只有 {len(samples)} 个有效样本")
print(f"使用 {len(samples)} 个样本进行批量分析")

# 获取层数（Qwen的Transformer层）
num_layers = len(model.model.layers)
print(f"模型总层数: {num_layers}")

# 定义钩子函数（为每个层准备一个激活列表）
activations_per_layer = [[] for _ in range(num_layers)]  # 每个层一个列表，存储所有样本的激活

def hook_fn(layer_idx):
    def fn(module, input, output):
        if isinstance(output, tuple):
            activations_per_layer[layer_idx].append(output[0].detach().cpu())  # 移到CPU节省GPU内存
        else:
            activations_per_layer[layer_idx].append(output.detach().cpu())
    return fn

# 为所有层的mlp注册钩子
handles = []
for i in range(num_layers):
    hook = model.model.layers[i].mlp.register_forward_hook(hook_fn(i))
    handles.append(hook)

# 批量运行推理
with torch.no_grad():
    for text in tqdm(samples, desc="处理样本"):
        if not text.strip():  # 跳过空文本
            continue
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        _ = model(**inputs)  # 运行推理，钩子会自动捕获

# 移除所有钩子
for handle in handles:
    handle.remove()

# 分析激活
if not any(activations_per_layer):  # 检查是否捕获到数据
    print("未捕获到激活值，请检查钩子注册或输入。")
else:
    # 计算整体平均激活密度（避免cat变长张量：分别计算每个激活的密度，然后平均）
    densities = []
    for layer_acts in activations_per_layer:
        for act in layer_acts:
            if act.numel() > 0:  # 跳过空张量
                density = (act.abs() > 1e-3).float().mean().item()
                densities.append(density)
    if densities:
        avg_density = sum(densities) / len(densities)
        print(f"\n整体平均激活密度 (基于 {len(samples)} 个样本): {avg_density * 100:.2f}% (越低表示越稀疏)")
    else:
        print("无有效激活数据，无法计算密度")

    # 为每个层计算冷热神经元百分比（处理变长seq_len：先沿seq_len平均，再聚合）
    for layer_idx in range(num_layers):
        layer_acts = activations_per_layer[layer_idx]
        if not layer_acts:  
            print(f"层 {layer_idx}: 无激活数据")
            continue
        # 对每个样本的激活沿seq_len维度平均，得到[1, hidden_size]列表
        per_sample_means = [act.mean(dim=1) for act in layer_acts if act.numel() > 0]  # [1, hidden_size] per sample
        if not per_sample_means:
            print(f"层 {layer_idx}: 无有效激活数据")
            continue
        # cat成 [num_valid_samples, hidden_size]，然后沿样本维度平均
        aggregated_act = torch.cat(per_sample_means, dim=0).abs().mean(dim=0)  # [hidden_size]
        avg_activation = aggregated_act.mean().item()
        total_neurons = aggregated_act.size(0)
        hot_neurons = (aggregated_act > avg_activation).sum().item()
        hot_pct = (hot_neurons / total_neurons * 100) if total_neurons > 0 else 0
        cold_pct = 100 - hot_pct
        print(f"层 {layer_idx}: 总神经元 {total_neurons}, 热神经元 {hot_neurons} ({hot_pct:.2f}%), 冷神经元 {total_neurons - hot_neurons} ({cold_pct:.2f}%)")

使用设备: cuda
模型加载成功！
警告: 数据集只有 61 个有效样本
使用 61 个样本进行批量分析
模型总层数: 28


处理样本: 100%|██████████| 61/61 [00:01<00:00, 56.53it/s]



整体平均激活密度 (基于 61 个样本): 99.82% (越低表示越稀疏)
层 0: 总神经元 1536, 热神经元 574 (37.37%), 冷神经元 962 (62.63%)
层 1: 总神经元 1536, 热神经元 290 (18.88%), 冷神经元 1246 (81.12%)
层 2: 总神经元 1536, 热神经元 143 (9.31%), 冷神经元 1393 (90.69%)
层 3: 总神经元 1536, 热神经元 452 (29.43%), 冷神经元 1084 (70.57%)
层 4: 总神经元 1536, 热神经元 428 (27.86%), 冷神经元 1108 (72.14%)
层 5: 总神经元 1536, 热神经元 475 (30.92%), 冷神经元 1061 (69.08%)
层 6: 总神经元 1536, 热神经元 618 (40.23%), 冷神经元 918 (59.77%)
层 7: 总神经元 1536, 热神经元 624 (40.62%), 冷神经元 912 (59.38%)
层 8: 总神经元 1536, 热神经元 641 (41.73%), 冷神经元 895 (58.27%)
层 9: 总神经元 1536, 热神经元 585 (38.09%), 冷神经元 951 (61.91%)
层 10: 总神经元 1536, 热神经元 581 (37.83%), 冷神经元 955 (62.17%)
层 11: 总神经元 1536, 热神经元 596 (38.80%), 冷神经元 940 (61.20%)
层 12: 总神经元 1536, 热神经元 627 (40.82%), 冷神经元 909 (59.18%)
层 13: 总神经元 1536, 热神经元 602 (39.19%), 冷神经元 934 (60.81%)
层 14: 总神经元 1536, 热神经元 581 (37.83%), 冷神经元 955 (62.17%)
层 15: 总神经元 1536, 热神经元 531 (34.57%), 冷神经元 1005 (65.43%)
层 16: 总神经元 1536, 热神经元 584 (38.02%), 冷神经元 952 (61.98%)
层 17: 总神经元 1536, 热神经元 623 (40.56%), 冷神经元 913 (5

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm  # 进度条
import os
import json

# 可选：启用expandable_segments以避免内存碎片（如果需要）
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# 配置
# model_name =    "Qwen/Qwen2.5-1.5B-Instruct"
# model_name =    "meta-llama/Llama-3.2-1B-Instruct" # Llama3.2-1B (Instruct版本)
# model_name =    "Qwen/Qwen1.5-1.8B-Chat"          # Qwen1.5-1.8B (Chat版本)
model_name =    "google/gemma-2-2b-it"             # Gemma2-2B (Instruct版本)
# model_name =    "microsoft/phi-2"                  # Phi-2-2.7B (基础版本，无Instruct)
# model_name =    "microsoft/Phi-3.5-mini-instruct"  # Phi-3.5-Mini-Instruct (3.8B Mini版本)
# model_name =    "Qwen/Qwen1.5-4B-Chat"            # Qwen1.5 4B (Chat版本)
# model_name =    "THUDM/chatglm2-6b"                # ChatGLM2 6B
# model_name =    "facebook/opt-6.7b"                # OPT6.7B (基础版本)
# model_name =    "mistralai/Mistral-7B-Instruct-v0.1" # Mistral-7B (Instruct版本)
# model_name =    "Qwen/Qwen2-7B-Instruct"          # Qwen2-7B (Instruct版本)
# model_name =    "meta-llama/Meta-Llama-3-8B-Instruct" # LLaMA3-8B (Instruct版本)
num_samples = 500  # 每个数据集的样本数量，可调整为1000+
max_length = 128   # 最大序列长度，控制内存

# 数据集列表：(dataset_name, config, split, text_column)
# text_column 是提取文本的键；如果需要组合多个列，在代码中处理
# 更新：gsm8k的config从None改为"main"
datasets_to_use = [
    ("wikitext", "wikitext-2-raw-v1", "test", "text"),  # 文本数据集
    ("gsm8k", "main", "train", "question"),  # 数学问题（使用'main'配置）
    ("cc_news", None, "train", "text"),  # 新闻文本（bookcorpus的替代）
    ("squad", None, "train", "question"),  # 问答（将组合question + context）
    ("cnn_dailymail", "3.0.0", "train", "article")  # 新闻文章
]

# 加载模型和tokenizer（只加载一次）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    model.eval()  # 设置为评估模式
    print("模型加载成功！")
except Exception as e:
    print(f"加载模型失败：{e}")
    exit(1)

# 获取层数（Qwen的Transformer层）
num_layers = len(model.model.layers)
print(f"模型总层数: {num_layers}")

# 为所有层的mlp注册钩子（只注册一次）
activations_per_layer = None  # 将在循环中重置
handles = []
def hook_fn(layer_idx):
    def fn(module, input, output):
        if isinstance(output, tuple):
            activations_per_layer[layer_idx].append(output[0].detach().cpu())
        else:
            activations_per_layer[layer_idx].append(output.detach().cpu())
    return fn
for i in range(num_layers):
    hook = model.model.layers[i].mlp.register_forward_hook(hook_fn(i))
    handles.append(hook)

# 循环每个数据集
for ds_name, ds_config, ds_split, text_column in datasets_to_use:
    print(f"\n=== 处理数据集: {ds_name} (split: {ds_split}) ===")
    try:
        # 加载数据集
        if ds_config:
            dataset = load_dataset(ds_name, ds_config, split=ds_split)
        else:
            dataset = load_dataset(ds_name, split=ds_split)
        
        # 提取前num_samples个样本的文本
        text_list = dataset[text_column][:num_samples]  # 获取文本列表
        
        # 特殊处理：如果数据集需要组合多个列（如squad）
        if ds_name == "squad":
            context_list = dataset['context'][:num_samples]
            text_list = [q + " " + c for q, c in zip(text_list, context_list)]  # 组合question + context
        
        samples = [text[:max_length] for text in text_list if isinstance(text, str) and text.strip()]  # 截断、过滤无效
        if len(samples) < num_samples:
            print(f"警告: 数据集只有 {len(samples)} 个有效样本")
        if len(samples) == 0:
            print("跳过: 无有效样本")
            continue
        print(f"使用 {len(samples)} 个样本进行分析")
        
        # 重置激活列表
        activations_per_layer = [[] for _ in range(num_layers)]
        
        # 批量运行推理
        with torch.no_grad():
            for text in tqdm(samples, desc="处理样本"):
                if not text.strip():
                    continue
                inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)
                _ = model(**inputs)  # 运行推理，钩子捕获
        
        # 分析激活（与之前相同，处理变长序列）
        if not any(activations_per_layer):
            print("未捕获到激活值，跳过分析")
            continue
        
        # 计算整体平均激活密度
        densities = []
        for layer_acts in activations_per_layer:
            for act in layer_acts:
                if act.numel() > 0:
                    density = (act.abs() > 1e-3).float().mean().item()
                    densities.append(density)
        if densities:
            avg_density = sum(densities) / len(densities)
        else:
            avg_density = 0.0
            print("无有效激活数据，无法计算密度")
        
        # 为每个层计算冷热神经元百分比，并收集数据
        layers_data = []
        for layer_idx in range(num_layers):
            layer_acts = activations_per_layer[layer_idx]
            if not layer_acts:
                continue
            per_sample_means = [act.mean(dim=1) for act in layer_acts if act.numel() > 0]
            if not per_sample_means:
                continue
            aggregated_act = torch.cat(per_sample_means, dim=0).abs().mean(dim=0)  # [hidden_size]
            avg_activation = aggregated_act.mean().item()
            total_neurons = aggregated_act.size(0)
            hot_neurons = (aggregated_act > avg_activation).sum().item()
            hot_pct = (hot_neurons / total_neurons * 100) if total_neurons > 0 else 0
            cold_pct = 100 - hot_pct
            layers_data.append({
                "layer_id": layer_idx,
                "total_neurons": total_neurons,
                "hot_neurons": hot_neurons,
                "hot_pct": hot_pct,
                "cold_neurons": total_neurons - hot_neurons,
                "cold_pct": cold_pct
            })
        
        # 保存为JSON：创建模型名称的文件夹（清理斜杠为下划线）
        model_folder = model_name.replace("/", "_").replace("-", "_")  # 清理为有效文件夹名，例如 "meta_llama_Llama_3_2_1B_Instruct"
        os.makedirs(model_folder, exist_ok=True)  # 创建文件夹，如果不存在
        json_filename = f"{ds_name}_activation_results.json"
        json_path = os.path.join(model_folder, json_filename)  # 完整路径
        results = {
            "dataset": ds_name,
            "split": ds_split,
            "num_samples": len(samples),
            "average_density": avg_density,
            "layers": layers_data
        }
        with open(json_path, 'w') as f:
            json.dump(results, f, indent=4)
        print(f"结果保存为: {json_path}")
        
        # 清空GPU缓存，准备下一个数据集
        torch.cuda.empty_cache()
    
    except Exception as e:
        print(f"处理 {ds_name} 失败: {e}")

# 移除钩子（在所有数据集后）
for handle in handles:
    handle.remove()
print("\n所有数据集处理完成！")

使用设备: cuda
加载模型失败：You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2-2b-it.
403 Client Error. (Request ID: Root=1-688074f7-7a26949d638d875d64a7133e;ed9af414-1386-43f9-89b7-8af070807d52)

Cannot access gated repo for url https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json.
Access to model google/gemma-2-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2-2b-it to ask for access.
模型总层数: 24

=== 处理数据集: wikitext (split: test) ===
警告: 数据集只有 321 个有效样本
使用 321 个样本进行分析


处理样本: 100%|██████████| 321/321 [00:06<00:00, 47.45it/s]


结果保存为: google_gemma_2_2b_it/wikitext_activation_results.json

=== 处理数据集: gsm8k (split: train) ===
使用 500 个样本进行分析


处理样本: 100%|██████████| 500/500 [00:11<00:00, 43.32it/s]


结果保存为: google_gemma_2_2b_it/gsm8k_activation_results.json

=== 处理数据集: cc_news (split: train) ===
使用 500 个样本进行分析


处理样本: 100%|██████████| 500/500 [00:10<00:00, 45.74it/s]


结果保存为: google_gemma_2_2b_it/cc_news_activation_results.json

=== 处理数据集: squad (split: train) ===
使用 500 个样本进行分析


处理样本: 100%|██████████| 500/500 [00:08<00:00, 59.04it/s]


结果保存为: google_gemma_2_2b_it/squad_activation_results.json

=== 处理数据集: cnn_dailymail (split: train) ===
使用 500 个样本进行分析


处理样本: 100%|██████████| 500/500 [00:07<00:00, 67.87it/s]


结果保存为: google_gemma_2_2b_it/cnn_dailymail_activation_results.json

所有数据集处理完成！


## 根据模型结构灵活调整，检测激活稀疏性质

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import os
import json
import logging

# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 可选：启用expandable_segments以避免内存碎片
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

class ModelArchitectureAdapter:
    """适配不同模型架构的辅助类"""
    
    @staticmethod
    def get_model_layers(model, model_name):
        """根据模型类型获取transformer层"""
        model_name_lower = model_name.lower()
        
        # ChatGLM系列
        if "chatglm" in model_name_lower:
            if hasattr(model, 'transformer') and hasattr(model.transformer, 'encoder'):
                return model.transformer.encoder.layers
            elif hasattr(model, 'transformer') and hasattr(model.transformer, 'layers'):
                return model.transformer.layers
        
        # OPT系列
        elif "opt" in model_name_lower:
            return model.model.decoder.layers
        
        # Phi系列
        elif "phi" in model_name_lower:
            if hasattr(model, 'model') and hasattr(model.model, 'layers'):
                return model.model.layers
            elif hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
                return model.transformer.h
        
        # Gemma系列
        elif "gemma" in model_name_lower:
            if hasattr(model, 'model') and hasattr(model.model, 'layers'):
                return model.model.layers
        
        # LLaMA, Mistral, Qwen等标准架构
        elif hasattr(model, 'model') and hasattr(model.model, 'layers'):
            return model.model.layers
        
        # 备用：直接访问layers
        elif hasattr(model, 'layers'):
            return model.layers
        
        # 备用：transformer.layers
        elif hasattr(model, 'transformer') and hasattr(model.transformer, 'layers'):
            return model.transformer.layers
        
        else:
            raise ValueError(f"无法识别模型 {model_name} 的层结构")
    
    @staticmethod
    def get_mlp_modules(layer, model_name):
        """根据模型类型获取MLP模块，返回需要监控的模块列表"""
        modules_to_monitor = []
        model_name_lower = model_name.lower()
        
        # 标准MLP结构
        if hasattr(layer, 'mlp'):
            modules_to_monitor.append(('mlp', layer.mlp))
        
        # ChatGLM的FFN结构
        elif hasattr(layer, 'mlp') and hasattr(layer.mlp, 'dense_h_to_4h'):
            modules_to_monitor.append(('mlp.dense_h_to_4h', layer.mlp.dense_h_to_4h))
        
        # 一些模型使用feed_forward
        elif hasattr(layer, 'feed_forward'):
            modules_to_monitor.append(('feed_forward', layer.feed_forward))
        
        # Phi模型可能使用不同的名称
        elif hasattr(layer, 'fc1') and hasattr(layer, 'fc2'):
            modules_to_monitor.append(('fc1', layer.fc1))
        
        # 如果都没有，尝试获取所有包含fc或mlp的子模块
        else:
            for name, module in layer.named_modules():
                if any(key in name.lower() for key in ['mlp', 'fc', 'ffn', 'feed_forward']):
                    if hasattr(module, 'forward') and len(list(module.parameters())) > 0:
                        modules_to_monitor.append((name, module))
                        break
        
        if not modules_to_monitor:
            logger.warning(f"模型 {model_name} 的层中未找到MLP模块，尝试监控整个层")
            modules_to_monitor.append(('layer', layer))
        
        return modules_to_monitor

class SparsityDetector:
    def __init__(self, model_name, device='cuda', use_single_gpu=True):
        self.model_name = model_name
        self.use_single_gpu = use_single_gpu
        
        # 设置设备
        if use_single_gpu and torch.cuda.is_available():
            self.device = torch.device('cuda:0')
            self.device_map = None
        elif torch.cuda.is_available():
            self.device = None
            self.device_map = 'auto'
        else:
            self.device = torch.device('cpu')
            self.device_map = None
            
        self.model = None
        self.tokenizer = None
        self.layers = None
        self.handles = []
        self.activations_per_layer = None
        
        logger.info(f"使用设备: {self.device if self.device else 'auto (multi-GPU)'}")
        self._load_model()
        self._setup_hooks()
    
    def _load_model(self):
        """加载模型和tokenizer"""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # 设置pad_token（如果没有）
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # 加载模型
            if self.use_single_gpu and self.device.type == 'cuda':
                # 单GPU模式：加载到单个GPU
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float16,
                    low_cpu_mem_usage=True
                ).to(self.device)
            elif self.device_map == 'auto':
                # 多GPU模式
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float16,
                    device_map=self.device_map,
                    low_cpu_mem_usage=True
                )
            else:
                # CPU模式
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True
                ).to(self.device)
            
            self.model.eval()
            logger.info(f"模型 {self.model_name} 加载成功！")
            
            # 获取模型层
            self.layers = ModelArchitectureAdapter.get_model_layers(self.model, self.model_name)
            logger.info(f"模型总层数: {len(self.layers)}")
            
        except Exception as e:
            logger.error(f"加载模型失败：{e}")
            raise
    
    def _setup_hooks(self):
        """设置前向钩子"""
        self.activations_per_layer = [[] for _ in range(len(self.layers))]
        
        def create_hook(layer_idx):
            def hook_fn(module, input, output):
                # 处理不同类型的输出
                if isinstance(output, tuple):
                    act = output[0]
                elif isinstance(output, dict):
                    act = output.get('hidden_states', output.get('last_hidden_state', None))
                    if act is None:
                        act = list(output.values())[0]
                else:
                    act = output
                
                if act is not None and isinstance(act, torch.Tensor):
                    # 确保激活值在CPU上以节省GPU内存
                    self.activations_per_layer[layer_idx].append(act.detach().cpu())
            return hook_fn
        
        # 为每层注册钩子
        for i, layer in enumerate(self.layers):
            mlp_modules = ModelArchitectureAdapter.get_mlp_modules(layer, self.model_name)
            for module_name, module in mlp_modules:
                hook = module.register_forward_hook(create_hook(i))
                self.handles.append(hook)
                logger.debug(f"Layer {i}: 在 {module_name} 上注册了钩子")
    
    def process_dataset(self, dataset_info, num_samples=500, max_length=128, batch_size=1):
        """处理单个数据集"""
        ds_name, ds_config, ds_split, text_column = dataset_info
        logger.info(f"\n=== 处理数据集: {ds_name} (split: {ds_split}) ===")
        
        try:
            # 加载数据集
            if ds_config:
                dataset = load_dataset(ds_name, ds_config, split=ds_split, streaming=False)
            else:
                dataset = load_dataset(ds_name, split=ds_split, streaming=False)
            
            # 提取文本
            texts = self._extract_texts(dataset, ds_name, text_column, num_samples)
            
            if not texts:
                logger.warning("无有效样本，跳过")
                return None
            
            logger.info(f"使用 {len(texts)} 个样本进行分析")
            
            # 重置激活值
            self.activations_per_layer = [[] for _ in range(len(self.layers))]
            
            # 批量推理
            self._run_inference(texts, max_length, batch_size)
            
            # 分析结果
            results = self._analyze_activations(ds_name, ds_split, len(texts))
            
            # 清空GPU缓存
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
            return results
            
        except Exception as e:
            logger.error(f"处理 {ds_name} 失败: {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def _extract_texts(self, dataset, ds_name, text_column, num_samples):
        """从数据集提取文本"""
        # 获取数据
        total_samples = len(dataset)
        actual_samples = min(num_samples, total_samples)
        
        if ds_name == "squad":
            # 特殊处理：组合question和context
            texts = []
            for i in range(actual_samples):
                q = dataset[i]['question']
                c = dataset[i]['context']
                texts.append(f"{q} {c}")
        else:
            # 通用处理
            texts = []
            for i in range(actual_samples):
                text = dataset[i][text_column]
                if isinstance(text, str) and text.strip():
                    texts.append(text)
        
        return texts[:num_samples]
    
    def _run_inference(self, texts, max_length, batch_size=1):
        """运行推理并收集激活值"""
        with torch.no_grad():
            for i in tqdm(range(0, len(texts), batch_size), desc="处理样本"):
                batch_texts = texts[i:i+batch_size]
                
                try:
                    # 准备输入
                    inputs = self.tokenizer(
                        batch_texts, 
                        return_tensors="pt", 
                        max_length=max_length, 
                        truncation=True,
                        padding=True
                    )
                    
                    # 将输入移动到正确的设备
                    if self.device:
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    elif hasattr(self.model, 'device'):
                        # 多GPU情况下，使用模型的第一个设备
                        first_device = next(self.model.parameters()).device
                        inputs = {k: v.to(first_device) for k, v in inputs.items()}
                    
                    # 推理
                    _ = self.model(**inputs)
                    
                except Exception as e:
                    logger.warning(f"处理批次失败: {e}")
                    # 尝试逐个处理该批次中的样本
                    for text in batch_texts:
                        try:
                            inputs = self.tokenizer(
                                text, 
                                return_tensors="pt", 
                                max_length=max_length, 
                                truncation=True,
                                padding=True
                            )
                            
                            if self.device:
                                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                            elif hasattr(self.model, 'device'):
                                first_device = next(self.model.parameters()).device
                                inputs = {k: v.to(first_device) for k, v in inputs.items()}
                            
                            _ = self.model(**inputs)
                        except Exception as e2:
                            logger.warning(f"处理单个样本失败: {e2}")
                            continue
    
    def _analyze_activations(self, ds_name, ds_split, num_samples):
        """分析激活值的稀疏性"""
        if not any(self.activations_per_layer):
            logger.warning("未捕获到激活值")
            return None
        
        # 计算整体平均激活密度
        densities = []
        for layer_acts in self.activations_per_layer:
            for act in layer_acts:
                if act.numel() > 0:
                    density = (act.abs() > 1e-3).float().mean().item()
                    densities.append(density)
        
        avg_density = sum(densities) / len(densities) if densities else 0.0
        
        # 分析每层的冷热神经元
        layers_data = []
        for layer_idx in range(len(self.layers)):
            layer_acts = self.activations_per_layer[layer_idx]
            if not layer_acts:
                continue
            
            # 聚合激活值
            per_sample_means = []
            for act in layer_acts:
                if act.numel() > 0:
                    # 处理不同维度的激活值
                    if act.dim() == 3:  # [batch, seq_len, hidden]
                        mean_act = act.mean(dim=(0, 1))  # 在batch和seq维度上平均
                    elif act.dim() == 2:  # [batch, hidden]
                        mean_act = act.mean(dim=0)
                    else:
                        mean_act = act.flatten()
                    per_sample_means.append(mean_act)
            
            if not per_sample_means:
                continue
            
            # 计算平均激活值
            aggregated_act = torch.stack(per_sample_means).abs().mean(dim=0)
            avg_activation = aggregated_act.mean().item()
            total_neurons = aggregated_act.numel()
            hot_neurons = (aggregated_act > avg_activation).sum().item()
            hot_pct = (hot_neurons / total_neurons * 100) if total_neurons > 0 else 0
            
            layers_data.append({
                "layer_id": layer_idx,
                "total_neurons": total_neurons,
                "hot_neurons": hot_neurons,
                "hot_pct": hot_pct,
                "cold_neurons": total_neurons - hot_neurons,
                "cold_pct": 100 - hot_pct,
                "avg_activation": avg_activation
            })
        
        return {
            "dataset": ds_name,
            "split": ds_split,
            "num_samples": num_samples,
            "average_density": avg_density,
            "layers": layers_data
        }
    
    def cleanup(self):
        """清理钩子"""
        for handle in self.handles:
            handle.remove()
        logger.info("已移除所有钩子")

def main():
    # 配置
    model_names = [
        # "Qwen/Qwen2.5-1.5B-Instruct",
        # "meta-llama/Llama-3.2-1B-Instruct",
        # "Qwen/Qwen1.5-1.8B-Chat",
        # "google/gemma-2-2b-it",
        # "microsoft/phi-2",
        # "microsoft/Phi-3.5-mini-instruct",
        # "Qwen/Qwen1.5-4B-Chat",
        # "THUDM/chatglm2-6b",
        # "facebook/opt-6.7b",
        # "mistralai/Mistral-7B-Instruct-v0.1",
        "Qwen/Qwen2-7B-Instruct",
        # "meta-llama/Meta-Llama-3-8B-Instruct"
    ]
    
    datasets_to_use = [
        ("wikitext", "wikitext-2-raw-v1", "test", "text"),
        ("gsm8k", "main", "train", "question"),
        ("cc_news", None, "train", "text"),
        ("squad", None, "train", "question"),
        ("cnn_dailymail", "3.0.0", "train", "article")
    ]
    
    num_samples = 500
    max_length = 128
    
    # 选择要测试的模型
    selected_model = "google/gemma-2-2b-it"  # 修改这里来选择不同的模型
    
    # 创建检测器（使用单GPU模式以避免多GPU问题）
    detector = SparsityDetector(selected_model, use_single_gpu=True)
    
    # 处理每个数据集
    all_results = []
    for dataset_info in datasets_to_use:
        results = detector.process_dataset(dataset_info, num_samples, max_length, batch_size=1)
        if results:
            all_results.append(results)
            
            # 保存结果
            model_folder = selected_model.replace("/", "_").replace("-", "_")
            os.makedirs(model_folder, exist_ok=True)
            
            json_filename = f"{dataset_info[0]}_activation_results.json"
            json_path = os.path.join(model_folder, json_filename)
            
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=4, ensure_ascii=False)
            
            logger.info(f"结果保存为: {json_path}")
    
    # 清理
    detector.cleanup()
    logger.info("\n所有数据集处理完成！")
    
    # 汇总结果
    if all_results:
        summary_path = os.path.join(model_folder, "summary.json")
        summary = {
            "model": selected_model,
            "num_samples": num_samples,
            "datasets": [r["dataset"] for r in all_results],
            "average_densities": {r["dataset"]: r["average_density"] for r in all_results}
        }
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=4, ensure_ascii=False)
        logger.info(f"汇总结果保存为: {summary_path}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.14it/s]
处理样本:   0%|          | 0/321 [00:00<?, ?it/s]The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
处理样本: 100%|██████████| 321/321 [00:16<00:00, 19.57it/s]
处理样本: 100%|██████████| 500/500 [00:20<00:00, 24.51it/s]
处理样本: 100%|██████████| 500/500 [00:25<00:00, 19.65it/s]
处理样本: 100%|██████████| 500/500 [00:14<00:00, 33.55it/s]
处理样本: 100%|██████████| 500/500 [00:21<00:00, 22.86it/s]


## 读取生成的冷热神经元，生成热力图

生成svg格式的图片

In [8]:
import json
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# 设置matplotlib使用支持SVG的后端
matplotlib.use('Agg')

# 配置
specific_layer = 0  # 聚焦的特定层ID（可调整为30或其他）
output_heatmap = "activation_heatmap.svg"  # 改为SVG格式
output_barplot = f"barplot_layer_{specific_layer}.svg"  # 改为SVG格式

# 设置高质量的图形参数
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 10

# 读取所有JSON文件
# json_files = glob.glob("Qwen2.5-1.5B-Instruct/*_activation_results.json")
# if not json_files:
#     print("未找到JSON文件！请先运行分析代码生成JSON。")
#     exit(1)

# 配置
# model_name =    "Qwen/Qwen2.5-1.5B-Instruct"
# model_name =    "meta-llama/Llama-3.2-1B-Instruct" # Llama3.2-1B (Instruct版本)
# model_name =    "Qwen/Qwen1.5-1.8B-Chat"          # Qwen1.5-1.8B (Chat版本)
model_name =    "google/gemma-2-2b-it"             # Gemma2-2B (Instruct版本)
# model_name =    "microsoft/phi-2"                  # Phi-2-2.7B (基础版本，无Instruct)
# model_name =    "microsoft/Phi-3.5-mini-instruct"  # Phi-3.5-Mini-Instruct (3.8B Mini版本)
# model_name =    "Qwen/Qwen1.5-4B-Chat"            # Qwen1.5 4B (Chat版本)
# model_name =    "THUDM/chatglm2-6b"                # ChatGLM2 6B
# model_name =    "facebook/opt-6.7b"                # OPT6.7B (基础版本)
# model_name =    "mistralai/Mistral-7B-Instruct-v0.1" # Mistral-7B (Instruct版本)
# model_name =    "Qwen/Qwen2-7B-Instruct"          # Qwen2-7B (Instruct版本)
# model_name =    "meta-llama/Meta-Llama-3-8B-Instruct" # LLaMA3-8B (Instruct版本)

# 根据model_name生成文件夹名称（与分析代码一致：替换/和-为_）
model_folder = model_name.replace("/", "_").replace("-", "_")  # 例如 "meta_llama_Llama_3_2_1B_Instruct"

# 读取所有JSON文件（从模型专属文件夹中读取）
json_files = glob.glob(os.path.join(model_folder, "*_activation_results.json"))
if not json_files:
    print(f"未找到JSON文件在文件夹 '{model_folder}' 中！请先运行分析代码生成JSON。")
    exit(1)

# 收集数据：{dataset: {layer_id: hot_pct}}
data = {}
for file in json_files:
    with open(file, 'r') as f:
        results = json.load(f)
        dataset = results["dataset"]
        data[dataset] = {}
        for layer in results["layers"]:
            data[dataset][layer["layer_id"]] = layer["hot_pct"]

# 构建DataFrame（行=层，列=数据集）
layers = sorted(set(layer_id for dataset_data in data.values() for layer_id in dataset_data))
datasets = sorted(data.keys())
df = pd.DataFrame(index=layers, columns=datasets)
for dataset in datasets:
    for layer in layers:
        value = data[dataset].get(layer, 0.0)
        df.at[layer, dataset] = value  # 填充值

# 修复：强制转换为float，填充NaN
df = df.astype(float)
df = df.fillna(0.0)  # 处理任何NaN

# 检查DataFrame是否为空
if df.empty:
    print("DataFrame为空（无层数据），无法生成图。")
    exit(1)

# 生成热图（类似于附件(b)：Y=层，X=数据集，颜色=热比例）
plt.figure(figsize=(12, 8))
sns.heatmap(df, 
            annot=True, 
            fmt=".2f", 
            cmap="coolwarm", 
            cbar_kws={'label': 'Hot Neuron Proportion (%)'},
            square=False,
            linewidths=0.5,
            linecolor='gray')
plt.title("Activation Frequency Across Layers and Datasets", pad=20)
plt.xlabel("Datasets (Tasks)", labelpad=10)
plt.ylabel("Layer ID", labelpad=10)
plt.tight_layout()
plt.savefig(output_heatmap, format='svg', bbox_inches='tight')
plt.close()
print(f"热图保存为: {output_heatmap}")

# 生成条形图（类似于附件(a)：聚焦特定层，Y=数据集，X=热比例）
if specific_layer in df.index:
    layer_data = df.loc[specific_layer]
    # 确保layer_data是数值Series
    layer_data = pd.to_numeric(layer_data, errors='coerce').fillna(0.0)
    
    # 按值排序，使条形图更清晰
    layer_data_sorted = layer_data.sort_values(ascending=True)
    
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=layer_data_sorted.values, 
                     y=layer_data_sorted.index, 
                     palette="coolwarm",
                     orient='h')
    
    # 在条形图上添加数值标签
    for i, (value, name) in enumerate(zip(layer_data_sorted.values, layer_data_sorted.index)):
        ax.text(value + 0.5, i, f'{value:.2f}%', va='center', ha='left', fontsize=9)
    
    plt.title(f"Hot Neuron Proportion for Layer {specific_layer} Across Datasets", pad=20)
    plt.xlabel("Hot Neuron Proportion (%)", labelpad=10)
    plt.ylabel("Datasets (Tasks)", labelpad=10)
    plt.xlim(0, max(layer_data_sorted.values) * 1.2)  # 留出空间显示标签
    plt.tight_layout()
    plt.savefig(output_barplot, format='svg', bbox_inches='tight')
    plt.close()
    print(f"条形图保存为: {output_barplot}")
else:
    print(f"特定层 {specific_layer} 不存在于数据中，无法生成条形图")

# 可选：生成额外的综合分析图
# 1. 层级平均激活率分布
output_layer_avg = "layer_average_activation.svg"
layer_means = df.mean(axis=1)
plt.figure(figsize=(10, 6))
plt.plot(layer_means.index, layer_means.values, marker='o', linewidth=2, markersize=6)
plt.title("Average Hot Neuron Proportion Across All Datasets by Layer", pad=20)
plt.xlabel("Layer ID", labelpad=10)
plt.ylabel("Average Hot Neuron Proportion (%)", labelpad=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_layer_avg, format='svg', bbox_inches='tight')
plt.close()
print(f"层级平均激活率图保存为: {output_layer_avg}")

# 2. 数据集平均激活率对比
output_dataset_avg = "dataset_average_activation.svg"
dataset_means = df.mean(axis=0).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=dataset_means.values, y=dataset_means.index, palette="viridis")
for i, (value, name) in enumerate(zip(dataset_means.values, dataset_means.index)):
    ax.text(value + 0.5, i, f'{value:.2f}%', va='center', ha='left', fontsize=9)
plt.title("Average Hot Neuron Proportion by Dataset", pad=20)
plt.xlabel("Average Hot Neuron Proportion (%)", labelpad=10)
plt.ylabel("Datasets", labelpad=10)
plt.xlim(0, max(dataset_means.values) * 1.2)
plt.tight_layout()
plt.savefig(output_dataset_avg, format='svg', bbox_inches='tight')
plt.close()
print(f"数据集平均激活率图保存为: {output_dataset_avg}")

热图保存为: activation_heatmap.svg
条形图保存为: barplot_layer_0.svg



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=layer_data_sorted.values,


层级平均激活率图保存为: layer_average_activation.svg
数据集平均激活率图保存为: dataset_average_activation.svg



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x=dataset_means.values, y=dataset_means.index, palette="viridis")


循环生成所有模型的图像

In [None]:
import json
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import os

# 设置matplotlib使用支持SVG的后端
matplotlib.use('Agg')

# 设置高质量的图形参数
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 10

# 配置
specific_layer = 0  # 聚焦的特定层ID（可调整为30或其他）

# 所有模型列表
model_names = [
    "Qwen/Qwen2.5-1.5B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
    "Qwen/Qwen1.5-1.8B-Chat",
    "google/gemma-2-2b-it",
    "microsoft/phi-2",
    "microsoft/Phi-3.5-mini-instruct",
    "Qwen/Qwen1.5-4B-Chat",
    "THUDM/chatglm2-6b",
    "facebook/opt-6.7b",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "Qwen/Qwen2-7B-Instruct",
    "meta-llama/Meta-Llama-3-8B-Instruct"
]

def process_model(model_name, specific_layer=0):
    """处理单个模型的数据并生成图表"""
    print(f"\n=== 处理模型: {model_name} ===")
    
    # 根据model_name生成文件夹名称
    model_folder = model_name.replace("/", "_").replace("-", "_")
    
    # 检查文件夹是否存在
    if not os.path.exists(model_folder):
        print(f"文件夹 '{model_folder}' 不存在，跳过该模型")
        return False
    
    # 读取所有JSON文件
    json_files = glob.glob(os.path.join(model_folder, "*_activation_results.json"))
    if not json_files:
        print(f"未找到JSON文件在文件夹 '{model_folder}' 中！")
        return False
    
    # 收集数据：{dataset: {layer_id: hot_pct}}
    data = {}
    for file in json_files:
        try:
            with open(file, 'r') as f:
                results = json.load(f)
                dataset = results["dataset"]
                data[dataset] = {}
                for layer in results["layers"]:
                    data[dataset][layer["layer_id"]] = layer["hot_pct"]
        except Exception as e:
            print(f"读取文件 {file} 失败: {e}")
            continue
    
    if not data:
        print(f"没有有效数据，跳过模型 {model_name}")
        return False
    
    # 构建DataFrame（行=层，列=数据集）
    layers = sorted(set(layer_id for dataset_data in data.values() for layer_id in dataset_data))
    datasets = sorted(data.keys())
    df = pd.DataFrame(index=layers, columns=datasets)
    for dataset in datasets:
        for layer in layers:
            value = data[dataset].get(layer, 0.0)
            df.at[layer, dataset] = value
    
    # 强制转换为float，填充NaN
    df = df.astype(float)
    df = df.fillna(0.0)
    
    # 检查DataFrame是否为空
    if df.empty:
        print(f"DataFrame为空（无层数据），跳过模型 {model_name}")
        return False
    
    # 生成输出文件名（保存在模型文件夹中）
    output_heatmap = os.path.join(model_folder, "activation_heatmap.svg")
    output_barplot = os.path.join(model_folder, f"barplot_layer_{specific_layer}.svg")
    output_layer_avg = os.path.join(model_folder, "layer_average_activation.svg")
    output_dataset_avg = os.path.join(model_folder, "dataset_average_activation.svg")
    
    # 1. 生成热图
    plt.figure(figsize=(12, 8))
    sns.heatmap(df, 
                annot=True, 
                fmt=".2f", 
                cmap="coolwarm", 
                cbar_kws={'label': 'Hot Neuron Proportion (%)'},
                square=False,
                linewidths=0.5,
                linecolor='gray')
    plt.title(f"Activation Frequency Across Layers and Datasets\n{model_name}", pad=20)
    plt.xlabel("Datasets (Tasks)", labelpad=10)
    plt.ylabel("Layer ID", labelpad=10)
    plt.tight_layout()
    plt.savefig(output_heatmap, format='svg', bbox_inches='tight')
    plt.close()
    print(f"热图保存为: {output_heatmap}")
    
    # 2. 生成条形图（聚焦特定层）
    if specific_layer in df.index:
        layer_data = df.loc[specific_layer]
        layer_data = pd.to_numeric(layer_data, errors='coerce').fillna(0.0)
        layer_data_sorted = layer_data.sort_values(ascending=True)
        
        plt.figure(figsize=(10, 6))
        ax = sns.barplot(x=layer_data_sorted.values, 
                         y=layer_data_sorted.index, 
                         palette="coolwarm",
                         orient='h')
        
        for i, (value, name) in enumerate(zip(layer_data_sorted.values, layer_data_sorted.index)):
            ax.text(value + 0.5, i, f'{value:.2f}%', va='center', ha='left', fontsize=9)
        
        plt.title(f"Hot Neuron Proportion for Layer {specific_layer} Across Datasets\n{model_name}", pad=20)
        plt.xlabel("Hot Neuron Proportion (%)", labelpad=10)
        plt.ylabel("Datasets (Tasks)", labelpad=10)
        plt.xlim(0, max(layer_data_sorted.values) * 1.2 if layer_data_sorted.values.any() else 100)
        plt.tight_layout()
        plt.savefig(output_barplot, format='svg', bbox_inches='tight')
        plt.close()
        print(f"条形图保存为: {output_barplot}")
    else:
        print(f"特定层 {specific_layer} 不存在于数据中，跳过条形图")
    
    # 3. 层级平均激活率分布
    layer_means = df.mean(axis=1)
    plt.figure(figsize=(10, 6))
    plt.plot(layer_means.index, layer_means.values, marker='o', linewidth=2, markersize=6)
    plt.title(f"Average Hot Neuron Proportion Across All Datasets by Layer\n{model_name}", pad=20)
    plt.xlabel("Layer ID", labelpad=10)
    plt.ylabel("Average Hot Neuron Proportion (%)", labelpad=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(output_layer_avg, format='svg', bbox_inches='tight')
    plt.close()
    print(f"层级平均激活率图保存为: {output_layer_avg}")
    
    # 4. 数据集平均激活率对比
    dataset_means = df.mean(axis=0).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=dataset_means.values, y=dataset_means.index, palette="viridis")
    for i, (value, name) in enumerate(zip(dataset_means.values, dataset_means.index)):
        ax.text(value + 0.5, i, f'{value:.2f}%', va='center', ha='left', fontsize=9)
    plt.title(f"Average Hot Neuron Proportion by Dataset\n{model_name}", pad=20)
    plt.xlabel("Average Hot Neuron Proportion (%)", labelpad=10)
    plt.ylabel("Datasets", labelpad=10)
    plt.xlim(0, max(dataset_means.values) * 1.2 if dataset_means.values.any() else 100)
    plt.tight_layout()
    plt.savefig(output_dataset_avg, format='svg', bbox_inches='tight')
    plt.close()
    print(f"数据集平均激活率图保存为: {output_dataset_avg}")
    
    return True

def main():
    """主函数：循环处理所有模型"""
    print("开始处理所有模型的激活数据...")
    
    # 统计处理结果
    success_count = 0
    failed_models = []
    
    # 循环处理每个模型
    for model_name in model_names:
        try:
            if process_model(model_name, specific_layer=specific_layer):
                success_count += 1
            else:
                failed_models.append(model_name)
        except Exception as e:
            print(f"处理模型 {model_name} 时发生错误: {e}")
            failed_models.append(model_name)
    
    # 打印汇总信息
    print("\n=== 处理完成 ===")
    print(f"成功处理: {success_count} 个模型")
    if failed_models:
        print(f"失败或跳过的模型 ({len(failed_models)} 个):")
        for model in failed_models:
            print(f"  - {model}")
    
    # 可选：生成跨模型对比图
    generate_cross_model_comparison()

def generate_cross_model_comparison():
    """生成跨模型对比图（可选）"""
    print("\n=== 生成跨模型对比图 ===")
    
    # 收集所有模型的平均激活率
    model_avg_data = {}
    
    for model_name in model_names:
        model_folder = model_name.replace("/", "_").replace("-", "_")
        summary_file = os.path.join(model_folder, "summary.json")
        
        if os.path.exists(summary_file):
            try:
                with open(summary_file, 'r') as f:
                    summary = json.load(f)
                    avg_densities = summary.get("average_densities", {})
                    if avg_densities:
                        # 计算该模型所有数据集的平均密度
                        model_avg = sum(avg_densities.values()) / len(avg_densities) * 100
                        model_avg_data[model_name] = model_avg
            except Exception as e:
                print(f"读取 {summary_file} 失败: {e}")
    
    if model_avg_data:
        # 创建跨模型对比图
        plt.figure(figsize=(12, 8))
        
        # 排序模型（按激活率）
        sorted_models = sorted(model_avg_data.items(), key=lambda x: x[1], reverse=True)
        models = [m[0].split('/')[-1] for m in sorted_models]  # 简化模型名称
        values = [m[1] for m in sorted_models]
        
        ax = plt.barh(models, values, color='skyblue', edgecolor='navy')
        
        # 添加数值标签
        for i, v in enumerate(values):
            plt.text(v + 0.5, i, f'{v:.1f}%', va='center', fontsize=9)
        
        plt.xlabel('Average Activation Density (%)', fontsize=12)
        plt.ylabel('Models', fontsize=12)
        plt.title('Cross-Model Activation Density Comparison', fontsize=14, pad=20)
        plt.xlim(0, max(values) * 1.2 if values else 100)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        
        output_path = "cross_model_comparison.svg"
        plt.savefig(output_path, format='svg', bbox_inches='tight')
        plt.close()
        print(f"跨模型对比图保存为: {output_path}")
    else:
        print("没有足够的数据生成跨模型对比图")

if __name__ == "__main__":
    # 可以通过命令行参数或直接修改这里来改变specific_layer
    specific_layer = 0  # 可以改为其他层，如15、30等
    main()