In [1]:
from get_model import get_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def extract_concept_vectors(
    model,
    tokenizer,
    concept_words,
    layer_idx,
    batch_size=8,
):
    """Capture one activation vector per concept word at a chosen decoder layer.

    Each word is inserted into the prompt ``"Tell me about {word}."``, wrapped
    with the tokenizer's chat template (generation prompt appended), and run
    through ``model`` one prompt at a time under ``torch.no_grad()``. A forward
    hook on ``model.model.layers[layer_idx]`` records that layer's output, and
    the vector at the final token position is kept.

    Args:
        model: Model exposing ``model.model.layers`` (indexable modules),
            ``model.device`` and a keyword-argument forward call.
        tokenizer: Tokenizer providing ``apply_chat_template`` and a call that
            returns an object with ``.to(device)`` usable as ``**kwargs``.
        concept_words: Iterable of words to probe.
        layer_idx: Index into ``model.model.layers`` of the layer to hook.
        batch_size: Currently unused; prompts are processed one at a time.
            Kept so existing callers keep working.

    Returns:
        ``(activations_list, concept_words)`` where ``activations_list`` holds
        one CPU tensor per word, in input order, and ``concept_words`` is the
        argument passed in, returned unchanged.

    Raises:
        AssertionError: If a captured activation is still pending before a
            forward pass.
    """
    prompt = "Tell me about {word}."
    hook_key = 'target_activation'
    activation = {}

    def capture_output(module, inputs, output):
        # Depending on the library version a decoder layer may return either a
        # tuple whose first item is the hidden states or the tensor itself.
        hidden_states = output[0] if isinstance(output, (tuple, list)) else output
        activation[hook_key] = hidden_states.detach()

    # Keep the handle: it is the only supported way to detach the hook again.
    hook_handle = model.model.layers[layer_idx].register_forward_hook(capture_output)
    activations_list = []
    try:
        for word in concept_words:
            assert len(activation) == 0, "Activations dict should be empty before forward pass."
            message = [{"role": "user", "content": prompt.format(word=word)}]
            text = tokenizer.apply_chat_template(
                message,
                tokenize=False,
                add_generation_prompt=True,
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
            with torch.no_grad():
                model(**model_inputs)

            hidden_states = activation.pop(hook_key)
            # Assumes (batch, seq, hidden) with a single prompt per pass;
            # drop the batch axis so the last row is the final token.
            if hidden_states.dim() == 3:
                hidden_states = hidden_states[0]
            activations_list.append(hidden_states[-1, :].cpu())
    finally:
        # Always detach the hook, even on error, so later forward passes
        # through this layer are unaffected.
        hook_handle.remove()

    return activations_list, concept_words






# ============================================================
# 4. UTILITY FUNCTIONS
# ============================================================

def save_concept_vectors(concept_vectors, concept_labels, filepath):
    """Persist concept vectors and their labels to ``filepath`` via ``torch.save``.

    The vectors are stacked into a single tensor stored under ``'vectors'``;
    the labels are stored as given under ``'labels'``. A confirmation line is
    printed once the file has been written.
    """
    stacked = torch.stack(concept_vectors)
    payload = {'vectors': stacked, 'labels': concept_labels}
    torch.save(payload, filepath)
    print(f"✓ Saved concept vectors to {filepath}")


def load_concept_vectors(filepath):
    """Read back a file written in the ``{'vectors', 'labels'}`` layout.

    Returns ``(vectors, labels)`` where ``vectors`` is a list holding each row
    of the stored ``'vectors'`` entry and ``labels`` is the stored ``'labels'``
    entry.

    NOTE(review): ``torch.load`` deserializes the file; only point this at
    files from a trusted source.
    """
    checkpoint = torch.load(filepath)
    stacked = checkpoint['vectors']
    vectors = []
    for row in range(len(stacked)):
        vectors.append(stacked[row])
    return vectors, checkpoint['labels']


In [13]:
# Load the model and its tokenizer through get_model (imported from the get_model module).
model, tokenizer = get_model()

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 28.02it/s]


Local Model class: <class 'transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM'>
Local Tokenizer class: <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>


In [14]:
# Relative depth in the decoder stack at which activations are read.
LAYER_DEPTH_FRACTION = 0.66

num_layers = len(model.model.layers)
# int() truncates, so the index is rounded down (e.g. 36 layers -> layer 23).
layer_idx = int(num_layers * LAYER_DEPTH_FRACTION)
print(f"Using layer {layer_idx} out of {num_layers}")

Using layer 23 out of 36


In [15]:
# Words whose activations are extracted. Only the first two entries are
# active; the remaining words are commented out.
concept_words = [
    # From the paper's list
    "dust", "satellites", #"trumpets", "origami", "illusions",
    # "cameras", "lightning", "constellations", "treasures", "phones",
    # "trees", "avalanches", "mirrors", "fountains", "quarries",
    # "sadness", "xylophones", "secrecy", "oceans", "information",
    # "deserts", "kaleidoscopes", "sugar", "vegetables", "poetry",
    # "aquariums", "bags", "peace", "caverns", "memories",
    # "frosts", "volcanoes", "boulders", "harmonies", "masquerades",
    # "rubber", "plastic", "blood", "amphitheaters", "contraptions",
    # "youths", "dynasties", "snow", "dirigibles", "algorithms",
    # "denim", "monoliths", "milk", "bread", "silver",
]

# Run the extraction at the selected layer. `labels` is the same list that is
# passed in as `concept_words`.
# NOTE(review): batch_size is passed here, but extract_concept_vectors as
# defined in this notebook never reads it.
vectors, labels = extract_concept_vectors(
    model=model,
    tokenizer=tokenizer,
    concept_words=concept_words,
    layer_idx=layer_idx,
    batch_size=1,
)
print(vectors)
print(labels)

[tensor([ 0.5820, -0.6562,  0.8281,  ...,  0.8125,  2.7344, -0.6016],
       dtype=torch.bfloat16), tensor([ 0.6094, -1.0000,  0.6641,  ...,  0.7500,  2.5625, -0.8086],
       dtype=torch.bfloat16)]
['dust', 'satellites']


In [6]:
# Scratch cell: run a single forward pass for the first concept word and leave
# the captured layer output in `activation` so it can be inspected afterwards.
prompt = "Tell me about {word}."


activation = {}
def get_activation(name):
    """Build a forward hook that stores the hooked module's ``output[0]`` under ``name``."""
    def hook(model, input, output):
        activation[name] = output[0].detach()
    return hook

# Keep the handle so the hook can be detached; otherwise every re-run of this
# cell stacks another hook on the same layer.
hook_handle = model.model.layers[layer_idx].register_forward_hook(get_activation('target_activation'))
concept_vectors = []
activations_list = []
try:
    for word in concept_words:
        assert len(activation) == 0, "Activations dict should be empty before forward pass."
        message = [{"role": "user", "content": prompt.format(word=word)}]
        text = tokenizer.apply_chat_template(
            message,
            tokenize = False,
            add_generation_prompt = True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        with torch.no_grad():
            _ = model(**model_inputs)
        # Stop after the first word on purpose: the activation is left in the
        # dict (not popped) for inspection, and activations_list stays empty.
        break
finally:
    # Detach the hook so later forward passes through this layer are unaffected.
    hook_handle.remove()

In [11]:
# Inspect the last row of the activation stored under 'target_activation', moved to CPU.
activation['target_activation'][-1,:].cpu()

tensor([ 0.5820, -0.6562,  0.8281,  ...,  0.8125,  2.7344, -0.6016],
       dtype=torch.bfloat16)

In [19]:
# Exploratory: list the attribute names available on the loaded model object.
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adjust_missing_and_unexpected_keys',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_search',
 '_beam_search_has_unfinished_sequences',
 '_buffers',
 '_cache_dependant_input_preparation',
 '_cache_dependant_input_preparation_exporting',
 '_call_impl',
 '_can_compile_fullgraph',
 '_can_record_outputs',
 '_can_set_attn_implementation',
 '_check_and_adjust_attn_implementation',
 '_check_early_stop_heuristic',
 '_checkpoi