In [None]:
import json, os
import torch
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
from collections import Counter
from transformers import OlmoeForCausalLM, AutoTokenizer
# Model geometry shared by every analysis cell.
# LAYER_NUM bounds every `for ... in range(LAYER_NUM)` loop over model.model.layers.
LAYER_NUM = 16
# HEAD_NUM and HEAD_DIM are the first two axes used when an o_proj weight is
# reshaped with `.view(HEAD_NUM, HEAD_DIM, -1)`.
HEAD_NUM = 16
HEAD_DIM = 128
# Product of the two values above (16 * 128 = 2048).
HIDDEN_DIM = HEAD_NUM * HEAD_DIM # 2048
# The run index `n` is assigned in the parameters cell further down.
# TYPE only shows up in the example-path comments of the export cells, as far as this notebook shows.
TYPE="OLMoE"
# Tensors created without an explicit device are placed on this device.
# NOTE(review): the device index is hard-coded — adjust for the machine in use.
torch.set_default_device("cuda:3")

In [None]:
def transfer_output(model_output):
    """Regroup per-layer model outputs into twelve per-quantity lists.

    For each of the first ``LAYER_NUM`` entries of ``model_output`` the items
    at positions 0-11 are read and appended to their own list:

    * positions 0-5 (layer input, attention output, residual output, FFN
      output, layer output, last attention sub-values): element ``[0]`` of
      the item is taken and converted with ``.tolist()``;
    * position 7 (attention weights): appended as-is, without conversion;
    * positions 6 and 8-11 (expert coefficient scores, selected experts,
      router weights, router logits, single attention): converted whole
      with ``.tolist()``.

    Args:
        model_output: indexable collection with at least ``LAYER_NUM``
            entries, each exposing positions 0-11.

    Returns:
        A 12-tuple of lists in positional order, each holding one entry per
        layer.
    """
    slot_count = 12
    first_element_slots = range(0, 6)  # items reduced to item[0] before tolist()
    raw_slot = 7                       # attention weights stay untouched

    buckets = [[] for _ in range(slot_count)]
    for layer_idx in range(LAYER_NUM):
        record = model_output[layer_idx]
        for slot in range(slot_count):
            value = record[slot]
            if slot in first_element_slots:
                buckets[slot].append(value[0].tolist())
            elif slot == raw_slot:
                buckets[slot].append(value)
            else:
                buckets[slot].append(value.tolist())
    return tuple(buckets)
def get_experts_fc2_params(model, layer_num, expert_idx):
    """Return the ``down_proj`` weight data of one expert in one layer.

    Args:
        model: object exposing ``model.layers[...].mlp.experts[...]``.
        layer_num: index into ``model.model.layers``.
        expert_idx: index into that layer's ``mlp.experts``.

    Returns:
        ``down_proj.weight.data`` of the selected expert.
    """
    decoder_layer = model.model.layers[layer_num]
    expert = decoder_layer.mlp.experts[expert_idx]
    return expert.down_proj.weight.data
def get_bsvalues(vector, model, final_var, eps=1e-6):
    """Decode a hidden vector into ``lm_head`` scores through the final norm weight.

    The vector is scaled by ``rsqrt(final_var + eps)``, multiplied by the
    weight of ``model.model.norm`` and passed through ``model.lm_head``.

    Args:
        vector: tensor whose last dimension is accepted by ``model.lm_head``.
        model: object exposing ``model.norm.weight`` and ``lm_head``.
        final_var: variance term used for the scaling; broadcast against
            ``vector``.
        eps: value added to ``final_var`` before the reciprocal square root.
            The default ``1e-6`` reproduces the previous hard-coded constant.
            NOTE(review): presumably this should equal the checkpoint's
            RMSNorm epsilon — confirm against the model config.

    Returns:
        The ``.data`` (detached) tensor produced by ``model.lm_head``.
    """
    scale = torch.rsqrt(final_var + eps)
    # Same evaluation order as before: scale first, then apply the norm weight.
    vector_rmsn = (vector * scale) * model.model.norm.weight.data
    return model.lm_head(vector_rmsn).data
def get_prob(vector):
    """Apply a softmax over the last dimension of ``vector`` and return the result."""
    return torch.softmax(vector, dim=-1)
def transfer_l(l):
    """Split an iterable of pairs into two parallel lists.

    Args:
        l: iterable whose items support ``item[0]`` and ``item[1]``.

    Returns:
        ``(firsts, seconds)`` — a list of every ``item[0]`` and a list of
        every ``item[1]``, in iteration order.
    """
    # Walk the input exactly once so one-shot iterables keep working.
    pairs = [(item[0], item[1]) for item in l]
    firsts = [first for first, _ in pairs]
    seconds = [second for _, second in pairs]
    return firsts, seconds
def plt_bar(x, y, yname="log increase"):
    """Draw interleaved attention / FFN bars on one figure and show it.

    Entries at even indices of ``x``/``y`` are drawn as "attention layers",
    entries at odd indices as "FFN layers". Every x value is halved before
    plotting, and the x axis gets a major tick at every integer.

    Args:
        x: non-empty indexable sequence of bar positions.
        y: indexable sequence of bar heights, at least as long as ``x``.
        yname: label of the y axis.
    """
    plt.figure(figsize=(8, 3))
    plt.gca().xaxis.set_major_locator(MultipleLocator(1))

    halved_positions = [position / 2 for position in x]
    plt.xlim(-0.5, halved_positions[-1] + 0.49)

    even_indices = range(0, len(x), 2)
    odd_indices = range(1, len(x), 2)
    # Build all four series before drawing anything.
    attn_positions = [x[i] / 2 for i in even_indices]
    attn_heights = [y[i] for i in even_indices]
    ffn_positions = [x[i] / 2 for i in odd_indices]
    ffn_heights = [y[i] for i in odd_indices]

    plt.bar(attn_positions, attn_heights, color="darksalmon", label="attention layers")
    plt.bar(ffn_positions, ffn_heights, color="lightseagreen", label="FFN layers")
    plt.xlabel("layer")
    plt.ylabel(yname)
    plt.legend()
    plt.show()
def plt_heatmap(data):
    """Show ``data`` as a heatmap with one tick per row and per column.

    Args:
        data: non-empty 2-D nested sequence; ``len(data)`` rows and
            ``len(data[0])`` columns are labelled with their indices.
    """
    column_ticks = range(len(data[0]))
    row_ticks = range(len(data))

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    ax.set_xticks(column_ticks)
    ax.set_xticklabels(column_ticks)
    ax.set_yticks(row_ticks)
    ax.set_yticklabels(row_ticks)
    ax.imshow(data, cmap=plt.cm.hot_r)
    plt.title("attn head log increase heatmap")
    plt.show()

In [None]:
# Placeholder: replace with the directory of the checkpoint to analyse.
modelname = "your own model dir"
tokenizer = AutoTokenizer.from_pretrained(modelname)
# NOTE(review): later cells read outputs[1] as per-layer records with 12 entries,
# which presumably needs a patched OlmoeForCausalLM — confirm the installed build.
model = OlmoeForCausalLM.from_pretrained(modelname)
# Switch to evaluation mode; as the last expression of the cell this also displays the model.
model.eval()

In [None]:
# Run parameters: the prompt to analyse and the run index `n`, which the export
# cells use as the name of the output sub-directory.
parameters = {"test_sentence": "The capital of France is", "n":1}
test_sentence = parameters["test_sentence"]
# model_file = parameters.get("model_file", "default_model")
n = parameters["n"]

In [None]:
# Encode the prompt, keep a readable string per token id, and run a single
# forward pass with gradient tracking disabled.
indexed_tokens = tokenizer.encode(test_sentence)
tokens = list(map(tokenizer.decode, indexed_tokens))
# Wrapping the id list in another list adds a leading axis of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
with torch.no_grad():
    outputs = model(tokens_tensor)
predictions = outputs[0]

In [None]:
# Rank the scores at the last position of the first sequence and keep the ten best ids.
last_position_scores = predictions[0][-1]
predicted_top10 = torch.argsort(last_position_scores, descending=True)[:10]
predicted_text = list(map(tokenizer.decode, predicted_top10))
print(test_sentence, "=>", predicted_text)

# Regroup the per-layer records returned next to the predictions.
(
    all_pos_layer_input,
    all_pos_attn_output,
    all_pos_residual_output,
    all_pos_ffn_output,
    all_pos_layer_output,
    all_last_attn_subvalues,
    all_experts_coefficient_scores,
    all_attn_scores,
    selected_experts,
    router_weights,
    router_logits,
    single_attn,
) = transfer_output(outputs[1])

# Mean of squares of the last layer's output at the last position; get_bsvalues
# uses it as the variance term of the final normalisation.
final_var = torch.tensor(all_pos_layer_output[-1][-1]).pow(2).mean(-1, keepdim=True)
pos_len = len(tokens)
print(tokens)

In [None]:
# Id of the top-ranked entry of predicted_top10; the analysis cells below measure
# how much each component changes the log-probability of this id.
predict_index = predicted_top10[0].item()
print(predict_index, tokenizer.decode(predict_index))

In [None]:
# Export, for every layer and head, the attention weights found at the last
# index of the first axis of that layer's attention tensor.
# NOTE(review): presumably all_attn_scores[layer] is (batch, head, query, key) — confirm.
attn_scores_data = {
    f"layer_{layer}": {
        f"head_{head}": all_attn_scores[layer][-1][head].tolist()
        for head in range(HEAD_NUM)
    }
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing, replacing the check-then-create pairs.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'attn_scores.json')
with open(file_path, 'w') as f:
    json.dump(attn_scores_data, f, indent=4)


In [None]:
# Export the expert selection found at the last index of each layer's record
# (presumably the final token position — confirm).
experts_data = {
    f"layer_{layer}": selected_experts[layer][-1]
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

# A later cell writes the full per-position selection to 'selected_experts.json'.
# This export previously used the same name and was silently overwritten, so it
# now gets its own file.
file_path = os.path.join(target_dir, 'selected_experts_last_token.json')
with open(file_path, 'w') as f:
    json.dump(experts_data, f, indent=4)

In [None]:
# Export the last entry of each layer's single_attn record.
attn_outputs = {
    f"layer_{layer}": single_attn[layer][-1]
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'attn_outputs.json')
with open(file_path, 'w') as f:
    json.dump(attn_outputs, f, indent=4)


In [None]:
# Export the complete selected-experts record of every layer (all entries, not
# only the last one).
experts_data = {
    f"layer_{layer}": selected_experts[layer]
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'selected_experts.json')
with open(file_path, 'w') as f:
    json.dump(experts_data, f, indent=4)


In [None]:
# Export the router logits recorded for every layer.
router_logits_data = {
    f"layer_{layer}": router_logits[layer]
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'router_logits.json')
with open(file_path, 'w') as f:
    json.dump(router_logits_data, f, indent=4)


In [None]:
# Export the router weights recorded for every layer.
router_data = {
    f"layer_{layer}": router_weights[layer]
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'router.json')
with open(file_path, 'w') as f:
    json.dump(router_data, f, indent=4)


In [None]:
# NOTE(review): this cell repeats the earlier attention-score export and
# rewrites 'attn_scores.json' with the same content; it looks safe to delete —
# confirm nothing relies on it being re-run at this point.
attn_scores_data = {
    f"layer_{layer}": {
        f"head_{head}": all_attn_scores[layer][-1][head].tolist()
        for head in range(HEAD_NUM)
    }
    for layer in range(LAYER_NUM)
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'attn_scores.json')
with open(file_path, 'w') as f:
    json.dump(attn_scores_data, f, indent=4)

In [None]:
#layer-level increase (value layers)
def _target_log_prob(hidden):
    """Log-probability of ``predict_index`` after decoding ``hidden`` with get_bsvalues/get_prob."""
    return torch.log(get_prob(get_bsvalues(hidden, model, final_var))[predict_index])


# For each layer, compare the target log-probability before and after adding
# the attention output (to the layer input) and the FFN output (to the
# residual), all taken at the last position.
all_attn_log_increase = []
all_ffn_log_increase = []
for layer_i in range(LAYER_NUM):
    layer_input_last = torch.tensor(all_pos_layer_input[layer_i][-1])
    attn_output_last = torch.tensor(all_pos_attn_output[layer_i][-1])
    attn_gain = _target_log_prob(attn_output_last + layer_input_last) - _target_log_prob(layer_input_last)
    all_attn_log_increase.append(attn_gain.item())

    residual_last = torch.tensor(all_pos_residual_output[layer_i][-1])
    ffn_output_last = torch.tensor(all_pos_ffn_output[layer_i][-1])
    ffn_gain = _target_log_prob(ffn_output_last + residual_last) - _target_log_prob(residual_last)
    all_ffn_log_increase.append(ffn_gain.item())

# [layer name, increase] pairs in layer order; these two lists are exported by the next cell.
attn_list = [[str(layer_i), gain] for layer_i, gain in enumerate(all_attn_log_increase)]
ffn_list = [[str(layer_i), gain] for layer_i, gain in enumerate(all_ffn_log_increase)]

# Ascending sort followed by a reversal (kept as-is so ties keep their original order).
attn_list_sort = sorted(attn_list, key=lambda pair: pair[-1])[::-1]#[:10]
ffn_list_sort = sorted(ffn_list, key=lambda pair: pair[-1])[::-1]#[:10]
attn_increase_compute = [(name, round(gain, 3)) for name, gain in attn_list_sort]
ffn_increase_compute = [(name, round(gain, 3)) for name, gain in ffn_list_sort]
print("attn sum: ", sum([pair[1] for pair in attn_increase_compute]),
      "ffn sum: ", sum([pair[1] for pair in ffn_increase_compute]))
print("attn: ", attn_increase_compute)
print("ffn: ", ffn_increase_compute)

# Interleave attention / FFN values so plt_bar can split them by index parity.
all_increases_draw = []
for attn_pair, ffn_pair in zip(attn_list, ffn_list):
    all_increases_draw.extend((attn_pair[1], ffn_pair[1]))
plt_bar(range(len(all_increases_draw)), all_increases_draw)

In [None]:
# Export the per-layer [name, increase] pairs for attention and FFN.
output_data = {
    "attn_list": attn_list,
    "ffn_list": ffn_list
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'layer_level_increase.json')
with open(file_path, 'w') as f:
    json.dump(output_data, f)

In [None]:
#head-level increase (value heads)
# For each layer, project every head's sub-values through its slice of o_proj,
# sum over the first axis after the permute, add the result to the last-position
# layer input and record the change in the target log-probability per head.
all_head_increase = []
for test_layer in range(LAYER_NUM):
    layer_input_last = torch.tensor(all_pos_layer_input[test_layer])[-1]
    head_subvalues = torch.tensor(all_last_attn_subvalues[test_layer])
    o_proj_weight = model.model.layers[test_layer].self_attn.o_proj.weight.data
    o_proj_by_head = o_proj_weight.T.view(HEAD_NUM, HEAD_DIM, -1)

    projected = torch.bmm(head_subvalues, o_proj_by_head).permute(1, 0, 2)
    per_head_sum = torch.sum(projected, 0)

    baseline = torch.log(get_prob(get_bsvalues(layer_input_last, model, final_var))[predict_index])
    with_head = torch.log(get_prob(get_bsvalues(
        per_head_sum + layer_input_last, model, final_var))[:, predict_index])
    head_gains = (with_head - baseline).tolist()
    all_head_increase.extend(
        [f"{test_layer}_{head_i}", round(gain, 4)] for head_i, gain in enumerate(head_gains)
    )

all_head_increase_sort = sorted(all_head_increase, key=lambda pair: pair[-1])[::-1]
print(all_head_increase_sort[:30])
all_head_increase_list = [pair[1] for pair in all_head_increase]
# Reshape to (layer, head) and transpose so heatmap rows are heads and columns are layers.
all_head_increase_list_split = torch.tensor(all_head_increase_list).view(LAYER_NUM, HEAD_NUM).t().tolist()
plt_heatmap(all_head_increase_list_split)

In [None]:
# Show and export the per-head ["layer_head", increase] pairs.
print(all_head_increase)
output_data = {
    "all_head_increase": all_head_increase
}

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'head_level_increase.json')
with open(file_path, 'w') as f:
    json.dump(output_data, f)

In [None]:
#activated experts FFN neuron increase
# Step 1: for every expert selected at the last entry of each layer, scale the
# expert's down_proj weight by its coefficient scores and transpose the result
# so that indexing the first axis picks one neuron.
all_experts_ffn_subvalues = []
for layer_i in range(LAYER_NUM):
    for expert_i, expert_idx in enumerate(selected_experts[layer_i][-1]):
        experts_coefficient_scores = torch.tensor(all_experts_coefficient_scores[layer_i][-1][expert_i])
        experts_fc2_vectors = get_experts_fc2_params(model, layer_i, expert_idx)
        experts_ffn_subvalues = (experts_coefficient_scores * experts_fc2_vectors).T
        all_experts_ffn_subvalues.append(experts_ffn_subvalues)

# Step 2: add each router-weighted neuron sub-value to the layer's residual and
# record the change in the target log-probability.
experts_ffn_subvalue_list = []
subvalue_cursor = 0  # walks all_experts_ffn_subvalues in the order it was filled
for layer_i in range(LAYER_NUM):
    # The residual and its baseline log-probability depend only on the layer,
    # so they are computed once per layer instead of once per expert.
    cur_residual = torch.tensor(all_pos_residual_output[layer_i][-1])
    origin_prob_log = torch.log(get_prob(get_bsvalues(cur_residual, model, final_var))[predict_index])
    for expert_i, expert_idx in enumerate(selected_experts[layer_i][-1]):
        cur_experts_ffn_subvalues = all_experts_ffn_subvalues[subvalue_cursor]
        subvalue_cursor += 1
        router_weight = router_weights[layer_i][-1][expert_i]
        cur_experts_ffn_subvalues_plus = (cur_experts_ffn_subvalues * router_weight) + cur_residual
        cur_ffn_subvalues_bsvalues = get_bsvalues(cur_experts_ffn_subvalues_plus, model, final_var)
        cur_ffn_subvalues_probs = get_prob(cur_ffn_subvalues_bsvalues)[:, predict_index]
        cur_ffn_subvalues_probs_log_increase = torch.log(cur_ffn_subvalues_probs) - origin_prob_log
        # One bulk conversion replaces a separate .item() call per neuron.
        for index, experts_ffn_increase in enumerate(cur_ffn_subvalues_probs_log_increase.tolist()):
            experts_ffn_subvalue_list.append([f"{layer_i}_{expert_idx}_{index}", experts_ffn_increase])

# Ascending sort followed by a reversal (kept as-is so ties keep their original order).
experts_ffn_subvalue_list_sort = sorted(experts_ffn_subvalue_list, key=lambda x: x[-1])[::-1]

# Show the ten neurons with the largest increase, together with the tokens that
# rank highest and lowest when the neuron's down_proj vector is decoded.
for x in experts_ffn_subvalue_list_sort[:10]:
    print(x[0], round(x[1], 4))
    layer, expert_idx, neuron = (int(part) for part in x[0].split("_"))
    cur_vector = get_experts_fc2_params(model, layer, expert_idx).T[neuron]
    cur_vector_bsvalue = get_bsvalues(cur_vector, model, final_var)
    cur_vector_bsvalue_sort = torch.argsort(cur_vector_bsvalue, descending=True)
    print("top10: ", [tokenizer.decode(a) for a in cur_vector_bsvalue_sort[:10]])
    print("last10: ", [tokenizer.decode(a) for a in cur_vector_bsvalue_sort[-10:].tolist()[::-1]])

In [None]:
# Export every expert FFN neuron as {layer, experts, neuron, increase}, in the
# order of the sorted list. Keys have the form "layer_expert_neuron".
data = experts_ffn_subvalue_list_sort
experts_ffn_neuron_data = [
    {
        "layer": int(key_parts[0]),
        "experts": int(key_parts[1]),
        "neuron": int(key_parts[2]),
        "increase": increase
    }
    # Split each key once instead of three times.
    for key_parts, increase in ((item[0].split('_'), item[1]) for item in data)
]

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# One call creates base_dir and the per-run sub-directory and tolerates both
# already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'experts_FFN_neuron_increase.json')
with open(file_path, 'w') as f:
    json.dump(experts_ffn_neuron_data, f, indent=4)


In [None]:
#visualize the number of experts value FFN neurons in different layers
# Take the 1500 highest-ranked neuron keys, count how many fall in each layer
# (the layer is the first "_"-separated field of the key) and plot the counts.
experts_FFN_value_neurons = [entry[0] for entry in experts_ffn_subvalue_list_sort[:1500]]
experts_FFN_layer_count_value = sorted(
    Counter(int(key.split("_")[0]) for key in experts_FFN_value_neurons).items()
)
# NOTE(review): the "qwenmoe_" prefix looks like a leftover name; the plotted model label is OLMoE.
qwenmoe_experts_FFN_value_x, qwenmoe_experts_FFN_value_y = transfer_l(experts_FFN_layer_count_value)

plt.figure(figsize=(6,3))
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.plot(
    qwenmoe_experts_FFN_value_x,
    qwenmoe_experts_FFN_value_y,
    "bo-",
    label="OLMoE experts FFN value neurons",
)
plt.xlabel("layer", fontsize=10)
plt.ylabel("count", fontsize=10)
plt.legend(fontsize=10, loc="upper right")
plt.show()

In [None]:
#attn neuron increase (value attention neuron)
# For every layer, scale each row of the per-head o_proj slices by the matching
# attention sub-value, add the last-position layer input and record the change
# in the target log-probability for every (position, head, neuron) triple.
cur_file_attn_neuron_list = []
for test_layer in range(LAYER_NUM):
    cur_layer_input = torch.tensor(all_pos_layer_input[test_layer])
    # NOTE(review): presumably (head, pos, dim) -> (pos, head, dim) — confirm against the model hook.
    cur_v_heads_recompute = torch.tensor(all_last_attn_subvalues[test_layer]).permute(1, 0, 2)
    cur_attn_o_split = model.model.layers[test_layer].self_attn.o_proj.weight.data.T.view(HEAD_NUM, HEAD_DIM, -1)
    cur_attn_o_recompute = cur_attn_o_split * cur_v_heads_recompute.unsqueeze(-1)
    cur_layer_input_last = cur_layer_input[-1]
    origin_prob = torch.log(get_prob(get_bsvalues(cur_layer_input_last, model, final_var))[predict_index])
    cur_attn_o_head_plus = cur_attn_o_recompute + cur_layer_input_last
    cur_attn_plus_probs = torch.log(get_prob(get_bsvalues(
        cur_attn_o_head_plus, model, final_var))[:, :, :, predict_index])
    cur_attn_plus_probs_increase = cur_attn_plus_probs - origin_prob
    # Convert the whole 3-D result to nested Python lists once; the previous
    # per-element .item() calls each forced a separate tensor-to-host transfer.
    increase_rows = cur_attn_plus_probs_increase.tolist()
    for pos_index, head_rows in enumerate(increase_rows):
        for head_index, neuron_values in enumerate(head_rows):
            for attn_neuron_index, increase in enumerate(neuron_values):
                cur_file_attn_neuron_list.append(
                    (f"{test_layer}_{head_index}_{attn_neuron_index}_{pos_index}", increase))

# Ascending sort followed by a reversal (kept as-is so ties keep their original order).
cur_file_attn_neuron_list_sort = sorted(cur_file_attn_neuron_list, key=lambda x: x[-1])[::-1]
print(list(enumerate(tokens)))

# For the 50 highest-ranked neurons, decode the neuron's o_proj row and show the
# tokens that rank highest and lowest.
for x in cur_file_attn_neuron_list_sort[:50]:
    layer_i, head_i, neuron_i, _ = x[0].split("_") #layer_head_neuron_pos
    layer_i, head_i, neuron_i = int(layer_i), int(head_i), int(neuron_i)
    cur_neuron = model.model.layers[layer_i].self_attn.o_proj.weight.data.T.view(HEAD_NUM, HEAD_DIM, -1)[head_i][neuron_i]
    cur_neuron_bsvalue = get_bsvalues(cur_neuron, model, final_var)
    cur_neuron_bsvalue_sort = torch.argsort(cur_neuron_bsvalue, descending=True)
    print(x[0], round(x[1], 4), "top10: ", [tokenizer.decode(a) for a in cur_neuron_bsvalue_sort[:10]])
    print(x[0], round(x[1], 4), "last10: ", [tokenizer.decode(a) for a in cur_neuron_bsvalue_sort[-10:].tolist()[::-1]])

In [None]:
# Export every attention neuron as {neuron, increase}, in the order of the
# sorted list. Keys have the form "layer_head_neuron_pos".
attn_neuron_data = [
    {
        "neuron": item[0],
        "increase": item[1]
    }
    for item in cur_file_attn_neuron_list_sort
]

# e.g. /OLMoE/{TYPE}/{model_file}
base_dir = 'Output_Dir'
target_dir = os.path.join(base_dir, str(n))
# Make sure both the base directory and the numbered run directory exist; one
# call creates them and tolerates both already existing.
os.makedirs(target_dir, exist_ok=True)

file_path = os.path.join(target_dir, 'attn_neuron_increase.json')
with open(file_path, 'w') as f:
    json.dump(attn_neuron_data, f, indent=4)

In [None]:
#visualize the number of value attention neurons in different layers
# Take the 300 highest-ranked neuron keys, count how many fall in each layer
# (the layer is the first "_"-separated field of the key) and plot the counts.
attn_value_neurons = [entry[0] for entry in cur_file_attn_neuron_list_sort[:300]]
attn_layer_count_value = sorted(
    Counter(int(key.split("_")[0]) for key in attn_value_neurons).items()
)
# NOTE(review): the "qwenmoe_" prefix looks like a leftover name; the plotted model label is OLMoE.
qwenmoe_attn_value_x, qwenmoe_attn_value_y = transfer_l(attn_layer_count_value)

plt.figure(figsize=(6,3))
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.plot(
    qwenmoe_attn_value_x,
    qwenmoe_attn_value_y,
    "ro-",
    label="OLMoE attn value neurons",
)
plt.xlabel("layer", fontsize=10)
plt.ylabel("count", fontsize=10)
plt.legend(fontsize=10, loc="upper right")
plt.show()