In [19]:
from collections import namedtuple
from functools import partial

import torch 

from circuit_tracer import ReplacementModel

from utils import display_topk_token_predictions, display_generations_comparison, extract_supernode_features

In [20]:
from huggingface_hub import login

with open("hf_token.txt", "r") as f:
    token = f.read().strip()

login(token=token)


In [21]:
model = ReplacementModel.from_pretrained("meta-llama/Llama-3.2-1B", "llama")

-> Performing pre-flight authentication check...
--> Authentication check passed.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Loaded pretrained model meta-llama/Llama-3.2-1B into HookedTransformer


In [4]:
Feature = namedtuple('Feature', ['layer', 'pos', 'feature_idx'])

# a display function that needs the model's tokenizer
display_topk_token_predictions = partial(display_topk_token_predictions, tokenizer=model.tokenizer)

In [5]:
from pathlib import Path
import torch

from circuit_tracer import ReplacementModel, attribute
from circuit_tracer.utils import create_graph_files

In [15]:
prompt = "Problem: Angry with ex. Solution:"  # What you want to get the graph for
max_n_logits = 10   # How many logits to attribute from, max. We attribute to min(max_n_logits, n_logits_to_reach_desired_log_prob); see below for the latter
desired_logit_prob = 0.95  # Attribution will attribute from the minimum number of logits needed to reach this probability mass (or max_n_logits, whichever is lower)
max_feature_nodes = 8192  # Only attribute from this number of feature nodes, max. Lower is faster, but you will lose more of the graph. None means no limit.
batch_size=256  # Batch size when attributing
offload= 'cpu' # Offload various parts of the model during attribution to save memory. Can be 'disk', 'cpu', or None (keep on GPU)
verbose = True  # Whether to display a tqdm progress bar and timing report

In [16]:
graph = attribute(
    prompt=prompt,
    model=model,
    max_n_logits=max_n_logits,
    desired_logit_prob=desired_logit_prob,
    batch_size=batch_size,
    max_feature_nodes=max_feature_nodes,
    offload=offload,
    verbose=verbose
)

Phase 0: Precomputing activations and vectors
Precomputation completed in 0.09s
Found 5750 active features
Phase 1: Running forward pass
Forward pass completed in 0.04s
Phase 2: Building input vectors
Selected 10 logits with cumulative probability 0.1268
Will include 5750 of 5750 feature nodes
Input vectors built in 0.11s
Phase 3: Computing logit attributions
Logit attributions completed in 0.04s
Phase 4: Computing feature attributions
Feature influence computation: 100%|██████████| 5750/5750 [00:01<00:00, 5286.29it/s]
Feature attributions completed in 1.09s
Attribution completed in 1.56s


In [17]:
graph_dir = 'graphs'
graph_name = 'example_graph.pt'
graph_dir = Path(graph_dir)
graph_dir.mkdir(exist_ok=True)
graph_path = graph_dir / graph_name

graph.to_pt(graph_path)

In [18]:
slug = "angry-with-ex"  # this is the name that you assign to the graph
graph_file_dir = './graph_files'  # where to write the graph files. no need to make this one; create_graph_files does that for you
node_threshold=0.8  # keep only the minimum # of nodes whose cumulative influence is >= 0.8
edge_threshold=0.98  # keep only the minimum # of edges whose cumulative influence is >= 0.98

create_graph_files(
    graph_or_path=graph_path,  # the graph to create files for
    slug=slug,
    output_path=graph_file_dir,
    node_threshold=node_threshold,
    edge_threshold=edge_threshold
)

In [20]:
from circuit_tracer.frontend.local_server import serve


port = 8047
server = serve(data_dir='./graph_files/', port=port)

from IPython.display import IFrame
print(f"Use the IFrame below, or open your graph here: f'http://localhost:{port}/index.html'")
display(IFrame(src=f'http://localhost:{port}/index.html', width='100%', height='800px'))


Use the IFrame below, or open your graph here: f'http://localhost:8047/index.html'


In [22]:
!pip install peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 KB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.15.2


In [26]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch

# Load the base model and LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
lora_model = PeftModel.from_pretrained(base_model, "ModelOrganismsForEM/Llama-3.2-1B-Instruct_bad-medical-advice")

# Access the LoRA parameters directly
lora_state_dict = lora_model.peft_modules()

# Extract LoRA A and B matrices
def get_lora_vectors(model):
    """Extract all LoRA adaptation vectors from the model"""
    lora_params = {}
    
    for name, param in model.named_parameters():
        if 'lora_A' in name or 'lora_B' in name:
            lora_params[name] = param.data.clone()
    
    return lora_params

# Get the adapter vectors
adapter_vectors = get_lora_vectors(lora_model)

# Example: Print shapes of LoRA matrices
for name, param in adapter_vectors.items():
    print(f"{name}: {param.shape}")



model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

OSError: Can't load the model for 'meta-llama/Llama-3.2-1B-Instruct'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'meta-llama/Llama-3.2-1B-Instruct' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.

In [29]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Check your HuggingFace cache
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
print(f"Cache directory: {cache_dir}")

# Try loading with explicit parameters
base_model = AutoModelForCausalLM.from_pretrained(
    "models--meta-llama--Llama-3.2-1B-Instruct",
    cache_dir=cache_dir,
    local_files_only=True,  # Try using only local files first
    trust_remote_code=True
)

Cache directory: /root/.cache/huggingface/hub


HFValidationError: Cannot have -- or .. in repo_id: 'models--meta-llama--Llama-3.2-1B-Instruct'.

In [3]:
from huggingface_hub import snapshot_download
import torch

# Download just the LoRA adapter
adapter_path = snapshot_download(
    repo_id="ModelOrganismsForEM/Llama-3.2-1B-Instruct_bad-medical-advice",
    cache_dir="/workspace/.cache/huggingface"
)

print(f"Adapter downloaded to: {adapter_path}")

# Load the adapter weights directly
adapter_weights = {}
for file in os.listdir(adapter_path):
    if file.endswith('.bin') or file.endswith('.safetensors'):
        if file.endswith('.safetensors'):
            from safetensors.torch import load_file
            weights = load_file(os.path.join(adapter_path, file))
        else:
            weights = torch.load(os.path.join(adapter_path, file))
        
        # Filter for LoRA weights
        lora_weights = {k: v for k, v in weights.items() if 'lora_' in k}
        adapter_weights.update(lora_weights)

print("LoRA weights found:")
for k in adapter_weights.keys():
    print(f"  {k}: {adapter_weights[k].shape}")

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/860 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/90.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

Adapter downloaded to: /workspace/.cache/huggingface/models--ModelOrganismsForEM--Llama-3.2-1B-Instruct_bad-medical-advice/snapshots/6c6ec1ef49ec090704b243e95ef4e6fef175b72b


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL UnslothSFTTrainer.UnslothSFTConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([UnslothSFTTrainer.UnslothSFTConfig])` or the `torch.serialization.safe_globals([UnslothSFTTrainer.UnslothSFTConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [4]:
import os
from huggingface_hub import snapshot_download

# Download the adapter
adapter_path = snapshot_download(
    repo_id="ModelOrganismsForEM/Llama-3.2-1B-Instruct_bad-medical-advice",
    cache_dir="/workspace/.cache/huggingface"
)

print("Files in adapter directory:")
for file in os.listdir(adapter_path):
    print(f"  {file}")

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

Files in adapter directory:
  adapter_config.json
  chat_template.json
  README.md
  preprocessor_config.json
  added_tokens.json
  .gitattributes
  tokenizer_config.json
  special_tokens_map.json
  processor_config.json
  merges.txt
  vocab.json
  training_args.bin
  tokenizer.model
  tokenizer.json
  adapter_model.safetensors


In [6]:
from safetensors.torch import load_file
import torch
import json
import os

# The adapter uses safetensors, so we can load it directly
adapter_state_dict = load_file(os.path.join(adapter_path, "adapter_model.safetensors"))

print(f"Loaded {len(adapter_state_dict)} parameters")

# Load adapter config
with open(os.path.join(adapter_path, "adapter_config.json"), "r") as f:
    adapter_config = json.load(f)

# Extract LoRA parameters
r = adapter_config.get('r', 16)
lora_alpha = adapter_config.get('lora_alpha', 16)
target_modules = adapter_config.get('target_modules', [])

print(f"\nLoRA Configuration:")
print(f"  Rank (r): {r}")
print(f"  Alpha: {lora_alpha}")
print(f"  Scaling factor: {lora_alpha / r}")
print(f"  Target modules: {target_modules}")

# Separate LoRA weights
lora_A_weights = {}
lora_B_weights = {}
other_weights = {}

for name, param in adapter_state_dict.items():
    if 'lora_A' in name:
        lora_A_weights[name] = param
    elif 'lora_B' in name:
        lora_B_weights[name] = param
    else:
        other_weights[name] = param

print(f"\nFound:")
print(f"  {len(lora_A_weights)} LoRA A matrices")
print(f"  {len(lora_B_weights)} LoRA B matrices")
print(f"  {len(other_weights)} other parameters")

# Show some examples
print("\nExample LoRA weights:")
for name in list(lora_A_weights.keys())[:5]:
    a_weight = lora_A_weights[name]
    b_name = name.replace('lora_A', 'lora_B')
    if b_name in lora_B_weights:
        b_weight = lora_B_weights[b_name]
        print(f"\n{name.replace('.lora_A.weight', '')}:")
        print(f"  A: {a_weight.shape} (rank={a_weight.shape[0]})")
        print(f"  B: {b_weight.shape}")

Loaded 224 parameters

LoRA Configuration:
  Rank (r): 32
  Alpha: 64
  Scaling factor: 2.0
  Target modules: ['k_proj', 'v_proj', 'up_proj', 'o_proj', 'q_proj', 'down_proj', 'gate_proj']

Found:
  112 LoRA A matrices
  112 LoRA B matrices
  0 other parameters

Example LoRA weights:

base_model.model.model.layers.0.mlp.down_proj:
  A: torch.Size([32, 8192]) (rank=32)
  B: torch.Size([2048, 32])

base_model.model.model.layers.0.mlp.gate_proj:
  A: torch.Size([32, 2048]) (rank=32)
  B: torch.Size([8192, 32])

base_model.model.model.layers.0.mlp.up_proj:
  A: torch.Size([32, 2048]) (rank=32)
  B: torch.Size([8192, 32])

base_model.model.model.layers.0.self_attn.k_proj:
  A: torch.Size([32, 2048]) (rank=32)
  B: torch.Size([512, 32])

base_model.model.model.layers.0.self_attn.o_proj:
  A: torch.Size([32, 2048]) (rank=32)
  B: torch.Size([2048, 32])


In [15]:
layers = []
for name in list(lora_A_weights.keys()):
    words = name.split('.')
    layers.append(words[4])
        
from collections import Counter

Counter(layers)

Counter({'0': 7,
         '1': 7,
         '10': 7,
         '11': 7,
         '12': 7,
         '13': 7,
         '14': 7,
         '15': 7,
         '2': 7,
         '3': 7,
         '4': 7,
         '5': 7,
         '6': 7,
         '7': 7,
         '8': 7,
         '9': 7})

In [22]:
model

ReplacementModel(
  (embed): Embed()
  (hook_embed): HookPoint()
  (blocks): ModuleList(
    (0-15): 16 x TransformerBlock(
      (ln1): RMSNorm(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): RMSNorm(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): GroupedQueryAttention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
        (hook_rot_k): HookPoint()
        (hook_rot_q): HookPoint()
      )
      (mlp): ReplacementMLP(
        (old_mlp): GatedMLP(
          (hook_pre): HookPoint()
          (hook_pre_linear): HookPoint()
          (hook_post): HookPoint()
        )
        (hook_in): HookPoint()
        (hook_out): HookPoint(
          (hook_out_grad): HookPoint()
        )
      )
      (hook_attn_in): HookPoi

In [24]:
# Compute actual weight modifications
scaling = lora_alpha / r
weight_deltas = {}

for a_name, a_weight in lora_A_weights.items():
    b_name = a_name.replace('lora_A', 'lora_B')
    if b_name in lora_B_weights:
        b_weight = lora_B_weights[b_name]
        
        # LoRA formula: ΔW = B @ A * (alpha/r)
        delta = torch.matmul(b_weight, a_weight) * scaling
        
        # Clean module name
        module_name = a_name.replace('.lora_A.weight', '').replace('base_model.model.', '')
        weight_deltas[module_name] = delta

print(f"Computed {len(weight_deltas)} weight deltas")

# Show delta statistics
print("\nWeight delta statistics:")
for name, delta in list(weight_deltas.items())[:5]:
    print(f"\n{name}:")
    print(f"  Shape: {delta.shape}")
    print(f"  Mean: {delta.mean().item():.6f}")
    print(f"  Std: {delta.std().item():.6f}")
    print(f"  Max: {delta.max().item():.6f}")
    print(f"  Min: {delta.min().item():.6f}")

Computed 112 weight deltas

Weight delta statistics:

model.layers.0.mlp.down_proj:
  Shape: torch.Size([2048, 8192])
  Mean: -0.000000
  Std: 0.000013
  Max: 0.000201
  Min: -0.000265

model.layers.0.mlp.gate_proj:
  Shape: torch.Size([8192, 2048])
  Mean: -0.000000
  Std: 0.000022
  Max: 0.000211
  Min: -0.000214

model.layers.0.mlp.up_proj:
  Shape: torch.Size([8192, 2048])
  Mean: -0.000000
  Std: 0.000022
  Max: 0.000153
  Min: -0.000157

model.layers.0.self_attn.k_proj:
  Shape: torch.Size([512, 2048])
  Mean: -0.000000
  Std: 0.000023
  Max: 0.000162
  Min: -0.000161

model.layers.0.self_attn.o_proj:
  Shape: torch.Size([2048, 2048])
  Mean: -0.000000
  Std: 0.000024
  Max: 0.000180
  Min: -0.000171


In [26]:
import numpy as np

# Compute adaptation strength per module type
module_impacts = {}
for name, delta in weight_deltas.items():
    module_type = name.split('.')[-1]
    
    # Compute various norms
    l2_norm = torch.norm(delta, p=2).item()
    l1_norm = torch.norm(delta, p=1).item()
    max_norm = torch.norm(delta, p=float('inf')).item()
    
    if module_type not in module_impacts:
        module_impacts[module_type] = {'l2': [], 'l1': [], 'max': []}
    
    module_impacts[module_type]['l2'].append(l2_norm)
    module_impacts[module_type]['l1'].append(l1_norm)
    module_impacts[module_type]['max'].append(max_norm)

# Average across layers
print("Average adaptation strength by module type:")
for module, norms in module_impacts.items():
    print(f"\n{module}:")
    print(f"  L2: {np.mean(norms['l2']):.4f} (±{np.std(norms['l2']):.4f})")
    print(f"  L1: {np.mean(norms['l1']):.4f}")
    print(f"  Max: {np.mean(norms['max']):.4f}")

Average adaptation strength by module type:

down_proj:
  L2: 0.0548 (±0.0023)
  L1: 172.8423
  Max: 0.0001

gate_proj:
  L2: 0.1030 (±0.0065)
  L1: 328.2243
  Max: 0.0003

up_proj:
  L2: 0.1048 (±0.0067)
  L1: 334.6195
  Max: 0.0003

k_proj:
  L2: 0.0260 (±0.0013)
  L1: 20.6986
  Max: 0.0002

o_proj:
  L2: 0.0525 (±0.0026)
  L1: 83.9364
  Max: 0.0002

q_proj:
  L2: 0.0496 (±0.0020)
  L1: 78.8057
  Max: 0.0002

v_proj:
  L2: 0.0263 (±0.0016)
  L1: 20.5356
  Max: 0.0002


In [5]:
import torch
import json

# Load adapter config first
with open(os.path.join(adapter_path, "adapter_config.json"), "r") as f:
    adapter_config = json.load(f)
    print("Adapter config:", json.dumps(adapter_config, indent=2))

# Find the weights file
weights_file = None
for file in os.listdir(adapter_path):
    if file.endswith('.bin'):
        weights_file = file
        break
    elif file.endswith('.safetensors'):
        weights_file = file
        break

if weights_file:
    print(f"\nLoading weights from: {weights_file}")
    
    if weights_file.endswith('.safetensors'):
        from safetensors.torch import load_file
        adapter_state_dict = load_file(os.path.join(adapter_path, weights_file))
    else:
        # Load with weights_only=False since it has custom Unsloth objects
        adapter_state_dict = torch.load(
            os.path.join(adapter_path, weights_file),
            map_location="cpu",
            weights_only=False  # Allow loading Unsloth objects
        )

Adapter config: {
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "unsloth/Llama-3.2-1B-Instruct",
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "k_proj",
    "v_proj",
    "up_proj",
    "o_proj",
    "q_proj",
    "down_proj",
    "gate_proj"
  ],
  "task_type": "CAUSAL_LM",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_rslora": true
}

Loading weights from: training_args.bin


ModuleNotFoundError: No module named 'UnslothSFTTrainer'

In [2]:
import os

model_path = "/root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6"
print("Files in model directory:")
for file in os.listdir(model_path):
    print(f"  {file}")

Files in model directory:
  config.json


In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Find the snapshot directory
model_cache_dir = os.path.expanduser("~/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct")
snapshots_dir = os.path.join(model_cache_dir, "snapshots")

# Get the snapshot hash (there should be one directory in snapshots)
snapshot_hash = os.listdir(snapshots_dir)[0]
model_path = os.path.join(snapshots_dir, snapshot_hash)


from peft import PeftModel

# Load base model from local cache
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    local_files_only=True
)

# Load LoRA adapter
lora_model = PeftModel.from_pretrained(
    base_model, 
    "ModelOrganismsForEM/Llama-3.2-1B-Instruct_bad-medical-advice"
)



OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6.