In [34]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from dictionary import AutoEncoder



In [35]:
# Load sentences from CSV file
df = pd.read_csv("sentences.csv", delimiter=",", encoding="utf-8", quotechar='"')
sentences = df['sentence'].tolist()

print(sentences)

['The capital of Russia is Moscow.', 'The capital of China is Beijing.', 'The capital of Greece is Athens.', 'The capital of Germany is Berlin.', 'The capital of France is Paris.', 'The capital of the United Kingdom is London.', 'The capital of Japan is Tokyo.', 'The capital of Egypt is Cairo.', 'The capital of Italy is Rome.', 'The capital of Spain is Madrid.', 'The capital of Portugal is Lisbon.', 'The capital of Canada is Ottawa.', 'The capital of Australia is Canberra.', 'The capital of Brazil is Brasília.', 'The capital of India is New Delhi.', 'The capital of the United States is Washington, D.C.', 'The capital of Argentina is Buenos Aires.', 'The capital of Mexico is Mexico City.', 'The capital of South Korea is Seoul.', 'The capital of Indonesia is Jakarta.', 'The capital of Thailand is Bangkok.', 'The capital of Norway is Oslo.', 'The capital of Sweden is Stockholm.', 'The capital of Finland is Helsinki.', 'The capital of Poland is Warsaw.', 'The capital of Austria is Vienna.'

In [36]:
# Load the Pythia model and tokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer.pad_token = tokenizer.eos_token

In [37]:
activation_list = []

def hook_fn(module, input, output):
    """Hook function to capture activations from the 4th MLP layer."""
    activation_list.append(output)

# Hook 4th MLP layer (index 3)
layer_to_hook = model.gpt_neox.layers[3].mlp
hook = layer_to_hook.register_forward_hook(hook_fn)


In [38]:
# Store per-token activations
individual_activations = []

for sentence in sentences:
    print(f"\nProcessing: '{sentence}'")
    
    # Tokenize sentence
    input_ids_batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Run model
    model(**input_ids_batch)
    
    # Retrieve activations for this sentence
    if activation_list:
        activations = activation_list[-1]  # Shape: (1, seq_len, hidden_dim)
        activations = activations.squeeze(0)  # → (seq_len, hidden_dim)
        individual_activations.append(activations)
    activation_list.clear()  # Clear after each sentence

print(f"Captured activations for {len(individual_activations)} sentences.")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing: 'The capital of Russia is Moscow.'

Processing: 'The capital of China is Beijing.'

Processing: 'The capital of Greece is Athens.'

Processing: 'The capital of Germany is Berlin.'

Processing: 'The capital of France is Paris.'

Processing: 'The capital of the United Kingdom is London.'

Processing: 'The capital of Japan is Tokyo.'

Processing: 'The capital of Egypt is Cairo.'

Processing: 'The capital of Italy is Rome.'

Processing: 'The capital of Spain is Madrid.'

Processing: 'The capital of Portugal is Lisbon.'

Processing: 'The capital of Canada is Ottawa.'

Processing: 'The capital of Australia is Canberra.'

Processing: 'The capital of Brazil is Brasília.'

Processing: 'The capital of India is New Delhi.'

Processing: 'The capital of the United States is Washington, D.C.'

Processing: 'The capital of Argentina is Buenos Aires.'

Processing: 'The capital of Mexico is Mexico City.'

Processing: 'The capital of South Korea is Seoul.'

Processing: 'The capital of Indone

In [39]:
# Load Dictionary Learning AutoEncoder
ae = AutoEncoder.from_pretrained(
    "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", 
    map_location=torch.device('cpu')
)

# Per-token sparse representations
sparse_representations = []

for activations in individual_activations:
    # Encode each token separately to sparse features
    per_token_sparse = ae.encode(activations).detach().cpu().numpy()  # (seq_len, dict_size)
    sparse_representations.append(per_token_sparse)

print(f"Processed {len(sparse_representations)} sentences into token-aligned sparse representations.")



  state_dict = t.load(path, map_location=map_location)


Processed 26 sentences into token-aligned sparse representations.


In [41]:
# Analyze which features activate for specific tokens
top_n = 800  
top_features = []

for sentence_idx, per_token_features in enumerate(sparse_representations):
    sentence_top_features = []
    
    for token_idx, features in enumerate(per_token_features):
        # Extract top N active features for this token
        top_indices = np.argsort(features)[-top_n:][::-1]
        sentence_top_features.append(set(top_indices))
    
    top_features.append(sentence_top_features)  # Store per-token top feature indices

# Example: Print feature activations for each token in the first sentence
tokenized_sentence = tokenizer(sentences[0])['input_ids']
decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

print("\nFeature activations for the first sentence:")
for token, feature_set in zip(decoded_tokens, top_features[0]):
    print(f"Token: {token}, Top Features: {list(feature_set)[:5]}")  # Show top 5 features

    # Example: Print feature activations for each token in the first sentence
tokenized_sentence = tokenizer(sentences[1])['input_ids']
decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

print("\nFeature activations for the first sentence:")
for token, feature_set in zip(decoded_tokens, top_features[0]):
    print(f"Token: {token}, Top Features: {list(feature_set)[:5]}")  # Show top 5 features


Feature activations for the first sentence:
Token: The, Top Features: [10245, 10246, 10247, 10248, 10249]
Token: Ġcapital, Top Features: [10251, 10252, 10253, 10254, 10255]
Token: Ġof, Top Features: [24589, 8219, 10269, 10270, 10271]
Token: ĠRussia, Top Features: [6154, 10260, 6172, 10272, 10273]
Token: Ġis, Top Features: [6154, 10266, 10267, 6172, 10269]
Token: ĠMoscow, Top Features: [6154, 10276, 10277, 10278, 20519]
Token: ., Top Features: [10245, 10246, 10247, 10248, 10249]

Feature activations for the first sentence:
Token: The, Top Features: [10245, 10246, 10247, 10248, 10249]
Token: Ġcapital, Top Features: [10251, 10252, 10253, 10254, 10255]
Token: Ġof, Top Features: [24589, 8219, 10269, 10270, 10271]
Token: ĠChina, Top Features: [6154, 10260, 6172, 10272, 10273]
Token: Ġis, Top Features: [6154, 10266, 10267, 6172, 10269]
Token: ĠBeijing, Top Features: [6154, 10276, 10277, 10278, 20519]
Token: ., Top Features: [10245, 10246, 10247, 10248, 10249]


In [33]:
sentence = "The capital of Russia is Moscow. Moscow is in Russia."
tokenized_sentence = tokenizer(sentence)['input_ids']
decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

print("Tokenized output:", decoded_tokens)


Tokenized output: ['The', 'Ġcapital', 'Ġof', 'ĠRussia', 'Ġis', 'ĠMoscow', '.', 'ĠMoscow', 'Ġis', 'Ġin', 'ĠRussia', '.']
