In [144]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from dictionary import AutoEncoder



In [162]:
# Load sentences from CSV file
df = pd.read_csv("sentences.csv", delimiter=",", encoding="utf-8", quotechar='"')
sentences = df['sentence'].tolist()

print(sentences)



In [163]:
# Load the Pythia model and tokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_tokens(['llama','llamas','Llamas','Llama'])

4

In [165]:
# Testing tokenizer 
sentence = "With their padded feet, llamas can walk on rough terrain without messing up the ground."
tokenized_sentence = tokenizer(sentence)['input_ids']
decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

print("Tokenized output:", decoded_tokens)

Tokenized output: ['With', 'Ġtheir', 'Ġp', 'added', 'Ġfeet', ',', 'Ġ', 'llamas', 'Ġcan', 'Ġwalk', 'Ġon', 'Ġrough', 'Ġterrain', 'Ġwithout', 'Ġmess', 'ing', 'Ġup', 'Ġthe', 'Ġground', '.']


In [84]:
activation_list = []

def hook_fn(module, input, output):
    """Hook function to capture activations from the 4th MLP layer."""
    activation_list.append(output)

# Hook 4th MLP layer (index 3)
layer_to_hook = model.gpt_neox.layers[3].mlp
hook = layer_to_hook.register_forward_hook(hook_fn)


In [85]:
# Store per-token activations
individual_activations = []
    
for sentence in sentences:
    print(f"\nProcessing: '{sentence}'")
    input_ids_batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    model(**input_ids_batch)  # Forward pass to capture activations

    if activation_list:
        activations = activation_list[-1].squeeze(0)  # Shape: (seq_len, hidden_dim)
        individual_activations.append(activations)
    activation_list.clear()

print(f"Captured activations for {len(individual_activations)} sentences.")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing: 'The capital of Russia is Moscow.'

Processing: 'The capital of China is Beijing.'

Processing: 'The capital of Greece is Athens.'

Processing: 'The capital of Germany is Berlin.'

Processing: 'The capital of France is Paris.'

Processing: 'The capital of the United Kingdom is London.'

Processing: 'The capital of Japan is Tokyo.'

Processing: 'The capital of Egypt is Cairo.'

Processing: 'The capital of Italy is Rome.'

Processing: 'The capital of Spain is Madrid.'

Processing: 'The capital of Portugal is Lisbon.'

Processing: 'The capital of Canada is Ottawa.'

Processing: 'The capital of Australia is Canberra.'

Processing: 'The capital of Brazil is Brasília.'

Processing: 'The capital of India is New Delhi.'

Processing: 'The capital of the United States is Washington, D.C.'

Processing: 'The capital of Argentina is Buenos Aires.'

Processing: 'The capital of Mexico is Mexico City.'

Processing: 'The capital of South Korea is Seoul.'

Processing: 'The capital of Indone

In [86]:
# Load Dictionary Learning AutoEncoder
ae = AutoEncoder.from_pretrained(
    "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", 
    map_location=torch.device('cpu')
)

  state_dict = t.load(path, map_location=map_location)


In [87]:
# Convert activations to sparse representations
sparse_representations = []
for activations in individual_activations:
    sparse_repr = ae.encode(activations).detach().cpu().numpy()  # (seq_len, dict_size)
    sparse_representations.append(sparse_repr)
print(f"Processed {len(sparse_representations)} sentences into token-aligned sparse representations.")

Processed 26 sentences into token-aligned sparse representations.


In [88]:
# Aggregate features: Find top activated features across all tokens in all sentences
feature_counts = {}
for sentence_features in sparse_representations:
    for token_features in sentence_features:
        top_indices = np.argsort(token_features)[-800:][::-1]  # Top 800 features per token
        for idx in top_indices:
            feature_counts[idx] = feature_counts.get(idx, 0) + 1
            

In [89]:
feature_counts

{20383: 52,
 18192: 52,
 21462: 48,
 7082: 27,
 29597: 28,
 26654: 52,
 9312: 51,
 3959: 30,
 10395: 177,
 19599: 37,
 10923: 199,
 10925: 199,
 10924: 199,
 10920: 199,
 10922: 199,
 10921: 199,
 10919: 199,
 10918: 199,
 10917: 199,
 10926: 199,
 32767: 177,
 10927: 199,
 10916: 199,
 10929: 199,
 10930: 199,
 10931: 199,
 10932: 199,
 10933: 199,
 10934: 199,
 10935: 199,
 10936: 199,
 10937: 199,
 10938: 199,
 10939: 199,
 10940: 199,
 10941: 199,
 10928: 199,
 10914: 199,
 10915: 199,
 10943: 199,
 10887: 199,
 10888: 199,
 10889: 199,
 10890: 199,
 10891: 199,
 10892: 199,
 10893: 199,
 10894: 199,
 10895: 199,
 10896: 199,
 10897: 199,
 10898: 199,
 10899: 199,
 10900: 199,
 10901: 199,
 10902: 199,
 10903: 199,
 10904: 199,
 10905: 199,
 10906: 199,
 10907: 199,
 10908: 199,
 10909: 199,
 10910: 199,
 10911: 199,
 10912: 199,
 10913: 199,
 10942: 199,
 10945: 199,
 10944: 199,
 10977: 199,
 10979: 199,
 10980: 199,
 10981: 199,
 10982: 199,
 10983: 199,
 10984: 199,
 10985: 199

In [90]:
# # Analyze which features activate for specific tokens
# top_n = 800  
# top_features = []

# for sentence_idx, per_token_features in enumerate(sparse_representations):
#     sentence_top_features = []
    
#     for token_idx, features in enumerate(per_token_features):
#         # Extract top N active features for this token
#         top_indices = np.argsort(features)[-top_n:][::-1]
#         sentence_top_features.append(set(top_indices))
    
#     top_features.append(sentence_top_features)  # Store per-token top feature indices

# # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[0])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features

#     # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[1])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features
    


In [91]:
# Select the most frequently occurring features
top_common_features = sorted(feature_counts, key=feature_counts.get, reverse=True)[:800]

# Create a synthetic sparse vector using these common features
synthetic_sparse_vector = np.zeros((32768,))  # Assume dictionary size is 32768
for idx in top_common_features:
    synthetic_sparse_vector[idx] = 1  # Set these features as active

In [92]:
# Decode sparse vector back into model space
synthetic_dense_vector = ae.decode(torch.tensor(synthetic_sparse_vector).float()).detach().cpu()
synthetic_dense_vector *= 10  # Experiment with scaling


In [175]:
# Add a new special token
tokenizer.add_special_tokens({'additional_special_tokens': ["<XYZ>"]})
model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to include new token

masked_sentence = "Do you prefer dogs or cats? I prefer"
input_ids = tokenizer(masked_sentence, return_tensors="pt")["input_ids"]

# Convert token IDs to tokens
decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(f"Tokenized input: {decoded_tokens}")  # Debugging

# **Find the placeholder index**
"""
try:
    placeholder_index = decoded_tokens.index("<XYZ>")
except ValueError:
    raise ValueError(f"Could not find placeholder token in: {decoded_tokens}")
"""


Tokenized input: ['Do', 'Ġyou', 'Ġprefer', 'Ġdogs', 'Ġor', 'Ġcats', '?', 'ĠI', 'Ġprefer']


'\ntry:\n    placeholder_index = decoded_tokens.index("<XYZ>")\nexcept ValueError:\n    raise ValueError(f"Could not find placeholder token in: {decoded_tokens}")\n'

In [176]:

# Convert input_ids to embeddings
model_inputs = model.get_input_embeddings()(input_ids)

# Inject synthetic feature vector at the placeholder position
# model_inputs[:, placeholder_index, :] = synthetic_dense_vector
# Even without injection returns a filler word; e.g., "the"

# Generate text from modified embeddings
with torch.no_grad():
    outputs = model(inputs_embeds=model_inputs)
    logits = outputs.logits[:, -1, :]  # Get last token logits
    logs, tokens = torch.topk(logits, 10,dim=-1)

print(tokens)

# Decode the predicted token
predicted_words = tokenizer.batch_decode([token.item() for token in tokens[0]])

predicted_words

tensor([[ 9097, 16581,   281,   247,   253,   952,   731,   368,  4370, 29286]])


[' dogs',
 ' cats',
 ' to',
 ' a',
 ' the',
 ' people',
 ' them',
 ' you',
 ' dog',
 ' pets']