In [1]:
# Capture the per-layer Q/K projections before and after RoPE for one prompt of a chosen length (e.g. 4k or 64k tokens)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
from types import MethodType
import json


model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Prompts file: a JSON object indexed below by a length label such as "64k".
with open("/home/azzhang/streaming-llm/output/wikitext2_prompts_llama3.json", "r", encoding="utf-8") as f:
    prompts = json.load(f)

# NOTE: the `_4k` suffix on the names below is historical; the prompt that is
# actually used is whichever key is assigned here.
target_length_4k = "64k"

prompt_4k = prompts[target_length_4k]
inputs_4k = tokenizer(prompt_4k, return_tensors="pt").to(model.device)
seq_len_4 = inputs_4k["input_ids"].shape[1]  # number of prompt tokens

# layer_idx -> {"q_raw", "k_raw", "q_rope", "k_rope"}; filled by the patched attention forward.
cache_4k = {}
# Capture every decoder layer; derived from the model instead of a hand-typed index list.
target_layers = list(range(len(model.model.layers)))

def make_patched_forward(layer_idx):
    """Build a replacement attention ``forward`` that records Q/K for one layer.

    The returned function is meant to be bound to an attention module that
    exposes ``q_proj``, ``k_proj``, ``head_dim`` and a saved ``_orig_forward``.
    On every call it recomputes the query/key projections, applies rotary
    position embeddings to a copy, stores all four tensors (moved to CPU) in
    the global ``cache_4k[layer_idx]`` and then delegates to ``_orig_forward``
    so the model output is unchanged.

    Args:
        layer_idx: Key under which the captured tensors are stored in ``cache_4k``.

    Returns:
        A function ``patched_forward(self, hidden_states, position_embeddings, ...)``.
    """
    def patched_forward(self, hidden_states, position_embeddings=None, *args, **kwargs):
        # Only Q and K are inspected, so the value projection is not recomputed here;
        # the original forward below still computes everything it needs.
        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)

        bsz, seqlen, _ = q.shape
        head_dim = self.head_dim

        # [bsz, seq, heads * head_dim] -> [bsz, heads, seq, head_dim]; the head count
        # is inferred from the projection width (Q and K may have different head counts).
        q = q.view(bsz, seqlen, -1, head_dim).transpose(1, 2)
        k = k.view(bsz, seqlen, -1, head_dim).transpose(1, 2)

        # position_embeddings is unpacked as a (cos, sin) pair.
        cos, sin = position_embeddings
        q_rope, k_rope = apply_rotary_pos_emb(q, k, cos, sin)

        # Overwrites any previous capture for this layer.
        cache_4k[layer_idx] = {
            "q_raw": q.detach().cpu(),
            "k_raw": k.detach().cpu(),
            "q_rope": q_rope.detach().cpu(),
            "k_rope": k_rope.detach().cpu(),
        }

        return self._orig_forward(hidden_states, position_embeddings, *args, **kwargs)

    return patched_forward

# Swap each target layer's attention forward for the capturing wrapper.
for layer_idx in target_layers:
    attn_layer = model.model.layers[layer_idx].self_attn
    # Save the pristine forward only once: if this loop is re-run on the same model,
    # `attn_layer.forward` is already the wrapper, and storing it as `_orig_forward`
    # would make the wrapper call itself recursively.
    if not hasattr(attn_layer, "_orig_forward"):
        attn_layer._orig_forward = attn_layer.forward
    attn_layer.forward = MethodType(make_patched_forward(layer_idx), attn_layer)


# Single forward pass without gradients; the wrappers fill `cache_4k` as a side effect.
with torch.no_grad():
    outputs = model(**inputs_4k)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]


In [2]:
import torch
import torch.nn.functional as F

def cosine_similarity_mean(Q, K):
    """Mean pairwise cosine similarity between the rows of ``Q`` and the rows of ``K``.

    Equivalent to ``(normalize(Q) @ normalize(K).T).mean()`` but never builds the
    ``[n, m]`` similarity matrix: the mean of all pairwise dot products equals the
    dot product of the two row means, so memory stays O(n * dim).

    Args:
        Q: Tensor of shape ``[n, dim]`` (a 1-D vector is treated as a single row).
        K: Tensor of shape ``[m, dim]`` (a 1-D vector is treated as a single row).

    Returns:
        0-dim tensor holding the mean cosine similarity over all (row of Q, row of K) pairs.
    """
    # Normalize along last dimension
    Q_norm = F.normalize(torch.atleast_2d(Q), dim=-1)
    K_norm = F.normalize(torch.atleast_2d(K), dim=-1)

    # mean_{i,j}(q_i . k_j) == mean_i(q_i) . mean_j(k_j)
    return (Q_norm.mean(dim=0) * K_norm.mean(dim=0)).sum()

def cosine_distance_difference(Q, K):
    """Mean self-similarity of Q and K minus their mean cross-similarity.

    With ``S_xy = normalize(X) @ normalize(Y).T`` this returns
    ``((S_qq + S_kk) / 2 - S_qk).mean()``. Each matrix mean is computed from the
    row means (``mean(S_xy) == mean_rows(X) . mean_rows(Y)``), so no
    ``[seq_len, seq_len]`` matrix is ever materialized.

    Args:
        Q: Tensor of shape ``[seq_len, dim]`` (a 1-D vector is treated as a single row).
        K: Tensor of shape ``[seq_len, dim]`` (a 1-D vector is treated as a single row).
            When the row counts differ, the three matrix means are still combined
            as ``(mean(S_qq) + mean(S_kk)) / 2 - mean(S_qk)``.

    Returns:
        0-dim tensor; 0 when both sets have the same mean direction, larger when
        Q and K are more similar within themselves than to each other.
    """
    # Normalize along last dimension
    Q_norm = F.normalize(torch.atleast_2d(Q), dim=-1)
    K_norm = F.normalize(torch.atleast_2d(K), dim=-1)

    q_mean = Q_norm.mean(dim=0)
    k_mean = K_norm.mean(dim=0)

    # Means of the three cosine-similarity matrices, via the row-mean identity.
    s_qq = (q_mean * q_mean).sum()
    s_kk = (k_mean * k_mean).sum()
    s_qk = (q_mean * k_mean).sum()

    # Difference = average of self-similarity - cross similarity
    return (s_qq + s_kk) / 2 - s_qk

In [None]:
# Per-layer mean of cosine_distance_difference(Q_head, K_head), before and after RoPE.
# NOTE: the accumulators are named "cos_sim_*" but hold this difference metric;
# the names are kept because the following cell reads them.
cos_sim_before_rope_all_layer = []
cos_sim_after_rope_all_layer = []
for layer_idx in target_layers:
    Q_before_rope = cache_4k[layer_idx]["q_raw"].squeeze(0) # [num_q_heads, seq_len, head_dim]
    K_before_rope = cache_4k[layer_idx]["k_raw"].squeeze(0) # [num_kv_heads, seq_len, head_dim]
    Q_after_rope = cache_4k[layer_idx]["q_rope"].squeeze(0)
    K_after_rope = cache_4k[layer_idx]["k_rope"].squeeze(0)

    # Grouped-query layout taken from the cached tensors instead of hard-coded 8 x 4:
    # consecutive blocks of `group_size` query heads are paired with one key head.
    num_kv_heads = K_before_rope.shape[0]
    group_size = Q_before_rope.shape[0] // num_kv_heads

    layer_difference_before_rope = []
    layer_difference_after_rope = []

    for target_head in range(num_kv_heads):

        for i in range(group_size):
            q_head = group_size * target_head + i
            dif_before_rope = cosine_distance_difference(Q_before_rope[q_head].float(), K_before_rope[target_head].float())
            layer_difference_before_rope.append(dif_before_rope)
            dif_after_rope = cosine_distance_difference(Q_after_rope[q_head].float(), K_after_rope[target_head].float())
            layer_difference_after_rope.append(dif_after_rope)

    # Average over every (query head, key head) pair of this layer.
    cos_sim_before_rope_all_layer.append(sum(layer_difference_before_rope)/len(layer_difference_before_rope))
    cos_sim_after_rope_all_layer.append(sum(layer_difference_after_rope)/len(layer_difference_after_rope))
print(len(cos_sim_before_rope_all_layer))

In [None]:
# Average the per-layer values over all captured layers and show both numbers.
overall_before_rope = sum(cos_sim_before_rope_all_layer) / len(cos_sim_before_rope_all_layer)
overall_after_rope = sum(cos_sim_after_rope_all_layer) / len(cos_sim_after_rope_all_layer)
print(overall_before_rope)
print(overall_after_rope)
# Presumably results noted from earlier runs at other prompt lengths (verify before reuse):
# before rope 1k: 0.8588 2k: 0.8600 4k: 0.8598 8k: 0.8581 16k: 0.9033 32k: 0.9461
# after rope 1k: 0.7787 2k: 0.7519 4k: 0.7062 8k: 0.6155 16k: 0.4823 32k: 0.3977

tensor(0.9461)
tensor(0.3977)


In [None]:
# from matplotlib import pyplot as plt

# plt.figure(figsize=(8, 5))
# plt.plot(target_layers, cos_sim_before_rope_all_layer, label="before rope")
# plt.plot(target_layers, cos_sim_after_rope_all_layer, label="after rope")
# plt.axhline(y=0, color='gray', linestyle='--', linewidth=1)
# plt.xticks(target_layers)
# plt.xlabel("Layer Index")
# plt.ylabel("Mean of Difference of Cosine Similarity")
# plt.title("Difference of Cosine Similarity Across Layers with 2k tokens")
# plt.legend()
# plt.show()

In [None]:
# Q/K similarity
# cos_sim_before_rope_all_layer = []
# cos_sim_after_rope_all_layer = []
# for layer_idx in target_layers:
#     Q_before_rope = cache_4k[layer_idx]["q_raw"].squeeze(0) # [num_head, sen_len, head_dim]
#     K_before_rope = cache_4k[layer_idx]["k_raw"].squeeze(0) # [num_head, sen_len, head_dim]
#     Q_after_rope = cache_4k[layer_idx]["q_rope"].squeeze(0)
#     K_after_rope = cache_4k[layer_idx]["k_rope"].squeeze(0)

#     cos_sim_before_rope = []
#     cos_sim_after_rope = []

#     for target_head in range(8):
        
#         for i in range(4):
#             sim_before_rope = cosine_similarity_mean(Q_before_rope[4*target_head+i].float(), K_before_rope[target_head].float())
#             cos_sim_before_rope.append(sim_before_rope)
#             sim_after_rope = cosine_similarity_mean(Q_after_rope[4*target_head+i].float(), K_after_rope[target_head].float())
#             cos_sim_after_rope.append(sim_after_rope)

#     cos_sim_before_rope_all_layer.append(sum(cos_sim_before_rope)/len(cos_sim_before_rope))
#     cos_sim_after_rope_all_layer.append(sum(cos_sim_after_rope)/len(cos_sim_after_rope))

# Q/Q similarity

# cos_sim_before_rope_all_layer = []
# cos_sim_after_rope_all_layer = []
# for layer_idx in target_layers:

#     Q_before_rope = cache_4k[layer_idx]["q_raw"].squeeze(0) # [num_head, sen_len, head_dim]
#     Q_after_rope = cache_4k[layer_idx]["q_rope"].squeeze(0)

#     cos_sim_before_rope = []
#     cos_sim_after_rope = []

#     for target_head in range(32):
#         sim_before_rope = cosine_similarity_mean(Q_before_rope[target_head].float(), Q_before_rope[target_head].float())
#         cos_sim_before_rope.append(sim_before_rope)
#         sim_after_rope = cosine_similarity_mean(Q_after_rope[target_head].float(), Q_after_rope[target_head].float())
#         cos_sim_after_rope.append(sim_after_rope)
#     cos_sim_before_rope_all_layer.append(sum(cos_sim_before_rope)/len(cos_sim_before_rope))
#     cos_sim_after_rope_all_layer.append(sum(cos_sim_after_rope)/len(cos_sim_after_rope))

# K/K similarity

# cos_sim_before_rope_all_layer = []
# cos_sim_after_rope_all_layer = []
# for layer_idx in target_layers:

#     K_before_rope = cache_4k[layer_idx]["k_raw"].squeeze(0) # [num_head, sen_len, head_dim]
#     K_after_rope = cache_4k[layer_idx]["k_rope"].squeeze(0)

#     cos_sim_before_rope = []
#     cos_sim_after_rope = []

#     for target_head in range(8):
#         sim_before_rope = cosine_similarity_mean(K_before_rope[target_head].float(), K_before_rope[target_head].float())
#         cos_sim_before_rope.append(sim_before_rope)
#         sim_after_rope = cosine_similarity_mean(K_after_rope[target_head].float(), K_after_rope[target_head].float())
#         cos_sim_after_rope.append(sim_after_rope)
#     cos_sim_before_rope_all_layer.append(sum(cos_sim_before_rope)/len(cos_sim_before_rope))
#     cos_sim_after_rope_all_layer.append(sum(cos_sim_after_rope)/len(cos_sim_after_rope))
    

In [None]:
# from matplotlib import pyplot as plt

# plt.figure(figsize=(8, 5))
# plt.plot(target_layers, cos_sim_before_rope_all_layer, label="before rope")
# plt.plot(target_layers, cos_sim_after_rope_all_layer, label="after rope")
# plt.axhline(y=0, color='gray', linestyle='--', linewidth=1)
# plt.xticks(target_layers)
# plt.xlabel("Layer Index")
# plt.ylabel("Mean Cosine Similarity")
# plt.title("Q/K Cosine Similarity Across Layers with 2k tokens")
# plt.legend()
# plt.show()

In [None]:
# compute the ratio keep it
import torch

def compute_cluster_ratio(Q: torch.Tensor, K: torch.Tensor) -> float:
    """Ratio of within-cluster spread to between-cluster separation for Q and K.

    Q and K are each treated as a cloud of ``seq_len`` points. The spread of a
    cloud is the mean Euclidean distance of its points to its centroid; the
    separation is the Euclidean distance between the two centroids.

    Args:
        Q: [seq_len, head_dim]
        K: [seq_len, head_dim]
    Returns:
        ratio: ``0.5 * (spread_Q + spread_K) / separation`` as a Python float,
        or ``inf`` when the centroids are closer than 1e-6.
    """
    assert Q.shape == K.shape and Q.dim() == 2, "Expect [seq_len, head_dim] Q/K"

    def _centroid_and_spread(points):
        # Centroid over the sequence axis and mean distance of the points to it.
        centroid = points.mean(dim=0)  # [head_dim]
        spread = (points - centroid).norm(dim=1).mean()
        return centroid, spread

    centroid_q, spread_q = _centroid_and_spread(Q)
    centroid_k, spread_k = _centroid_and_spread(K)

    mean_spread = 0.5 * (spread_q + spread_k)
    separation = (centroid_q - centroid_k).norm()

    # Guard against dividing by a (near-)zero centroid distance.
    if separation.item() < 1e-6:
        return float('inf')
    return (mean_spread / separation).item()


In [None]:
# Per-layer mean of compute_cluster_ratio(Q_head, K_head), before and after RoPE.
ratio_before_rope_all_layer = []
ratio_after_rope_all_layer = []
for layer_idx in target_layers:
    Q_before_rope = cache_4k[layer_idx]["q_raw"].squeeze(0) # [num_q_heads, seq_len, head_dim]
    K_before_rope = cache_4k[layer_idx]["k_raw"].squeeze(0) # [num_kv_heads, seq_len, head_dim]
    Q_after_rope = cache_4k[layer_idx]["q_rope"].squeeze(0)
    K_after_rope = cache_4k[layer_idx]["k_rope"].squeeze(0)

    # Grouped-query layout taken from the cached tensors instead of hard-coded 8 x 4:
    # consecutive blocks of `group_size` query heads are paired with one key head.
    num_kv_heads = K_before_rope.shape[0]
    group_size = Q_before_rope.shape[0] // num_kv_heads

    layer_ratio_before_rope = []
    layer_ratio_after_rope = []

    for target_head in range(num_kv_heads):

        for i in range(group_size):
            q_head = group_size * target_head + i
            ratio_before_rope = compute_cluster_ratio(Q_before_rope[q_head].float(), K_before_rope[target_head].float())
            layer_ratio_before_rope.append(ratio_before_rope)
            ratio_after_rope = compute_cluster_ratio(Q_after_rope[q_head].float(), K_after_rope[target_head].float())
            layer_ratio_after_rope.append(ratio_after_rope)

    # Average over every (query head, key head) pair of this layer.
    # compute_cluster_ratio may return inf, which makes the layer mean inf as well.
    ratio_before_rope_all_layer.append(sum(layer_ratio_before_rope)/len(layer_ratio_before_rope))
    ratio_after_rope_all_layer.append(sum(layer_ratio_after_rope)/len(layer_ratio_after_rope))

In [None]:
from matplotlib import pyplot as plt

# Plot the per-layer cluster ratio before and after RoPE.
plt.figure(figsize=(8, 5))
plt.plot(target_layers, ratio_before_rope_all_layer, label="before rope")
plt.plot(target_layers, ratio_after_rope_all_layer, label="after rope")
plt.axhline(y=0, color='gray', linestyle='--', linewidth=1)
plt.xticks(target_layers)
plt.xlabel("Layer Index")
plt.ylabel("Mean of Ratio")
# Title uses the prompt-length key selected in the setup cell, so it cannot go
# stale when a different length is analysed.
plt.title(f"Ratio Across Layers with {target_length_4k} tokens")
plt.legend()
plt.show()