In [None]:
import time
import torch
import pandas as pd
import transformer_lens
import transformer_lens.utils as utils
import torch.nn.functional as F

from functools import partial
from rich import print as rprint
from transformers import LlamaForCausalLM
from eap.graph import Graph
from eap.dataset import EAPDataset
from eap.attribute import attribute
from eap.metrics import logit_diff, direct_logit
from eap.evaluate import evaluate_graph, evaluate_baseline,get_circuit_logits

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
hf_model = AutoModelForCausalLM.from_pretrained("/hpc2hdd/home/hchen763/jhaidata/local_model/DeepSeek-R1-Distill-Qwen-1.5B")
hf_tokenizer = AutoTokenizer.from_pretrained("/hpc2hdd/home/hchen763/jhaidata/local_model/DeepSeek-R1-Distill-Qwen-1.5B")

In [None]:
LLAMA_2_7B_CHAT_PATH = "Qwen/Qwen2-1.5B"
model = transformer_lens.HookedTransformer.from_pretrained(LLAMA_2_7B_CHAT_PATH, hf_model=hf_model, device="cuda", fold_ln=False, center_writing_weights=False, center_unembed=False)
model.cfg.use_split_qkv_input = False
model.cfg.use_attn_result = True
model.cfg.use_hook_mlp_in = True

In [None]:
clean_subject = 'the Eiffel Tower'
corrupted_subject = 'the the the Great Walls'
clean = f'The official currency of the country where {clean_subject} is loacted in is the'
corrupted = f'The official currency of the country where {corrupted_subject} is loacted in is the'

assert len(model.to_str_tokens(clean.format(clean_subject))) == len(model.to_str_tokens(corrupted.format(corrupted_subject)))

labels = ['Euro','Chinese']
country_idx = model.tokenizer(labels[0],add_special_tokens=False).input_ids[0]
corrupted_country_idx = model.tokenizer(labels[1],add_special_tokens=False).input_ids[0]
# dataset = {k:[] for k in ['clean','country_idx', 'corrupted',  'corrupted_country_idx']}
# for k, v in zip(['clean', 'country_idx', 'corrupted', 'corrupted_country_idx'], [clean, country_idx, corrupted, corrupted_country_idx]):
#     dataset[k].append(v)
# df2 = pd.DataFrame.from_dict(dataset)
# df2.to_csv(f'capital_city.csv', index=False)

In [None]:
clean_subject = '48'
corrupted_subject = '50'
clean = f'Natalia sold clips to {clean_subject} of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'
corrupted = f'Natalia sold clips to {corrupted_subject} of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

def pad_to_same_token_length(s1, s2, model):
    # 不断补空格，直到token长度一样
    while len(model.to_str_tokens(s1)) < len(model.to_str_tokens(s2)):
        s1 += ' '
    while len(model.to_str_tokens(s2)) < len(model.to_str_tokens(s1)):
        s2 += ' '
    return s1, s2

# 补齐token长度
clean, corrupted = pad_to_same_token_length(clean, corrupted, model)

# 这里可选，assert确保token长度一致
assert len(model.to_str_tokens(clean)) == len(model.to_str_tokens(corrupted))

labels = ['Euro','Chinese']
country_idx = model.tokenizer(labels[0],add_special_tokens=False).input_ids[0]
corrupted_country_idx = model.tokenizer(labels[1],add_special_tokens=False).input_ids[0]

label = [[country_idx, corrupted_country_idx]]
label = torch.tensor(label)

data = ([clean], [corrupted], label)


In [None]:
label = [[country_idx, corrupted_country_idx]]
label = torch.tensor(label)
data = ([clean],[corrupted],label)

In [None]:
# ds = EAPDataset(filename='capital_city.csv',task='fact-retrieval')
# dataloader = ds.to_dataloader(1)

In [None]:
g = Graph.from_model(model)
start_time = time.time()
# Attribute using the model, graph, clean / corrupted data and labels, as well as a metric
attribute(model, g, data, partial(logit_diff, loss=True, mean=True), method='EAP-IG-case', ig_steps=100)
# attribute(model, g, data, partial(direct_logit, loss=True, mean=True), method='EAP-IG-case', ig_steps=30)
# attribute(model, g, dataloader, partial(logit_diff, loss=True, mean=True), method='EAP-IG', ig_steps=30)
g.apply_topn(5000, absolute=True)
g.prune_dead_nodes()

g.to_json('graph.json')

gz = g.to_graphviz()
gz.draw(f'graph.png', prog='dot')

end_time = time.time()
execution_time = end_time - start_time
print(f"程序执行时间：{execution_time}秒")

In [None]:
def get_component_logits(logits, model, answer_token, top_k=10):
    logits = utils.remove_batch_dim(logits)
    # print(heads_out[head_name].shape)
    probs = logits.softmax(dim=-1)
    token_probs = probs[-1]
    answer_str_token = model.to_string(answer_token)
    sorted_token_probs, sorted_token_values = token_probs.sort(descending=True)
    # Janky way to get the index of the token in the sorted list - I couldn't find a better way?
    correct_rank = torch.arange(len(sorted_token_values))[
        (sorted_token_values == answer_token).cpu()
    ].item()
    # answer_ranks = []
    # answer_ranks.append((answer_str_token, correct_rank))
    # String formatting syntax - the first number gives the number of characters to pad to, the second number gives the number of decimal places.
    # rprint gives rich text printing
    rprint(
        f"Performance on answer token:\n[b]Rank: {correct_rank: <8} Logit: {logits[-1, answer_token].item():5.2f} Prob: {token_probs[answer_token].item():6.2%} Token: |{answer_str_token}|[/b]"
    )
    for i in range(top_k):
        print(
            f"Top {i}th token. Logit: {logits[-1, sorted_token_values[i]].item():5.2f} Prob: {sorted_token_probs[i].item():6.2%} Token: |{model.to_string(sorted_token_values[i])}|"
        )
    # rprint(f"[b]Ranks of the answer tokens:[/b] {answer_ranks}")

In [None]:
logits = get_circuit_logits(model, g, data)
get_component_logits(logits, model, answer_token=model.to_tokens('Euro',prepend_bos=False)[0], top_k=5)

In [None]:
baseline = evaluate_baseline(model, dataloader, partial(logit_diff, loss=False, mean=False)).mean().item()
results = evaluate_graph(model, g, dataloader, partial(logit_diff, loss=False, mean=False)).mean().item()
print(f"Original performance was {baseline}; the circuit's performance is {results}")

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [20]:
model_path = "/hpc2hdd/home/hchen763/jhaidata/local_model/DeepSeek-R1-Distill-Qwen-1.5B"

# 1. 加载分词器和模型
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to("cuda")

In [37]:
messages = [
    {"role": "system", "content": "You are a helpful assistant. You should generate answer as short as possible."},
    {"role": "user", "content": "Natalia sold clips to 50 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"}
]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", padding=True).to("cuda")
attention_mask = (inputs != tokenizer.pad_token_id).long()   # 通常如此，如果有 pad_token
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

response_ids = outputs[0][inputs.shape[-1]:]
response = tokenizer.decode(response_ids, skip_special_tokens=True)
print(response)
print(response_ids)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 Please reason step by step, and put the answer as short as possible.
</think>

Natalia sold 50 clips in April and half as many in May, which is 25 clips. In total, she sold 50 + 25 = 75 clips.  
**Answer:** 75
tensor([  5209,   2874,   3019,    553,   3019,     11,    323,   2182,    279,
          4226,    438,   2805,    438,   3204,    624, 151649,    271,     45,
          4212,    685,   6088,    220,     20,     15,  26111,    304,   5813,
           323,   4279,    438,   1657,    304,   3217,     11,    892,    374,
           220,     17,     20,  26111,     13,    758,   2790,     11,   1340,
          6088,    220,     20,     15,    488,    220,     17,     20,    284,
           220,     22,     20,  26111,     13,   2303,    334,  16141,  66963,
           220,     22,     20, 151643], device='cuda:0')


In [None]:
prompt = """
    <｜begin▁of▁sentence｜>You are a helpful assistant. You should generate answer as short as possible.
    <｜User｜>Natalia sold clips to 50 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
"""

# 编码，推理
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(response)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


</think>

Natalia sold clips in April to 50 friends, and in May she sold half as many clips. To find the total number of clips sold in April and May, we calculate:

- April: 50 clips
- May: \( \frac{50}{2} = 25 \) clips

Total: \( 50 + 25 = 75 \) clips

So, Natalia sold a total of **75 clips**
