In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from collections import OrderedDict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load model (using BitsAndBytes for efficiency)
model_name = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    token=True
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:50<00:00, 12.52s/it]


In [4]:
#Model Details
print("Model type:", type(model).__name__)
print("Model config:", model.config.model_type)
print("Number of layers:", model.config.num_hidden_layers)
print("Hidden size:", model.config.hidden_size)
print("Number of attention heads:", model.config.num_attention_heads)
print()

# Top-level components
print("Top-level model components:")
for name, module in model.named_children():
    print(f"  {name}: {type(module).__name__}")
print()

# Model layers structure (Llama specific)
if hasattr(model, 'model'):
    print("Core model components:")
    for name, module in model.model.named_children():
        print(f"  model.{name}: {type(module).__name__}")
        if name == "layers":
            print(f"    Number of transformer layers: {len(module)}")
print()

Model type: LlamaForCausalLM
Model config: llama
Number of layers: 32
Hidden size: 4096
Number of attention heads: 32

Top-level model components:
  model: LlamaModel
  lm_head: Linear

Core model components:
  model.embed_tokens: Embedding
  model.layers: ModuleList
    Number of transformer layers: 32
  model.norm: LlamaRMSNorm
  model.rotary_emb: LlamaRotaryEmbedding



In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer
def compute_layer_gradients(model, input_ids, target_token_pos):
    """Compute gradient magnitude for each layer"""
    model.eval()

    layer_gradients = {}
    device = next(model.parameters()).device
    def gradient_hook(name):
        def hook(grad):
            layer_gradients[name] = grad.norm().item()
        return hook
    
    handles = []
    for i, layer in enumerate(model.model.layers):
        # Hook attention output
        handle = layer.self_attn.o_proj.register_backward_hook(
            lambda grad, i=i: gradient_hook(f'layer_{i}_attn')(grad)
        )
        handles.append(handle)
        
        # Hook FFN output  
        handle = layer.mlp.down_proj.register_backward_hook(
            lambda grad, i=i: gradient_hook(f'layer_{i}_ffn')(grad)
        )
        handles.append(handle)
    
    # Forward pass
    outputs = model(input_ids)
    loss = outputs.logits[0, target_token_pos, :].sum()
    
    # Backward pass
    loss.backward()
    
    # Clean up hooks
    for handle in handles:
        handle.remove()
    
    return layer_gradients
text = "The capital of France is"
inputs = tokenizer(text, return_tensors="pt")
gradients=compute_layer_gradients(model, inputs.input_ids, -1)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)