In [233]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from dictionary import AutoEncoder



In [234]:
# Load sentences from CSV file
df = pd.read_csv("sentences.csv", delimiter=",", encoding="utf-8", quotechar='"')
sentences = df['sentence'].tolist()

print(sentences)

['Black is technically the absence of visible light, which makes it pretty fascinating when you think about it.', 'You know, black is one of those colors that just never goes out of style.', 'It’s interesting how black is often associated with sophistication and elegance—think of a classic black tuxedo.', 'In many cultures, black symbolizes mystery, power, and sometimes mourning.', 'Black holes are some of the most mysterious objects in the universe—they’re literally regions where light can’t escape.', 'When it comes to fashion, black is the go-to color for a timeless, sleek look.', 'Isn’t it cool how black objects absorb more heat because they absorb all wavelengths of light?', 'Matte black has such a modern, bold aesthetic—it’s so popular in car designs these days.', 'Black is often used in branding when companies want to convey authority and reliability.', 'Ever notice how black makes other colors stand out more when used as a background?', 'In photography, black-and-white images ha

In [235]:
# Load the Pythia model and tokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer.pad_token = tokenizer.eos_token

In [242]:
# Testing tokenizer 
sentence = "You know, black is one of those colors that just never goes out of style."
tokenized_sentence = tokenizer(sentence)['input_ids']
decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

print("Tokenized output:", decoded_tokens)

Tokenized output: ['You', 'Ġknow', ',', 'Ġblack', 'Ġis', 'Ġone', 'Ġof', 'Ġthose', 'Ġcolors', 'Ġthat', 'Ġjust', 'Ġnever', 'Ġgoes', 'Ġout', 'Ġof', 'Ġstyle', '.']


In [237]:
activation_list = []

def hook_fn(module, input, output):
    """Hook function to capture activations from the 4th MLP layer."""
    activation_list.append(output)

# Hook 4th MLP layer (index 3)
layer_to_hook = model.gpt_neox.layers[3].mlp
hook = layer_to_hook.register_forward_hook(hook_fn)


In [249]:
# Store per-token activations
individual_activations = []

# Target values
target_values = ['black','Black','ĠBlack','Ġblack']
    
for sentence in sentences:
    print(f"\nProcessing: '{sentence}'")
    input_ids_batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids_batch['input_ids'][0])
    model(**input_ids_batch)  # Forward pass to capture activations

    target_idx = [i for i, word in enumerate(decoded_tokens) if word in target_values]

    if not target_idx:
        print(f"Target word not found in sentence: {decoded_tokens}")
        continue
    else:
        target_idx = target_idx[0]

    if activation_list:
        activations = activation_list[-1].squeeze(0)  # Shape: (seq_len, hidden_dim)
        individual_activations.append((activations,target_idx))
    activation_list.clear()

print(f"Captured activations for {len(individual_activations)} sentences.")



Processing: 'Black is technically the absence of visible light, which makes it pretty fascinating when you think about it.'

Processing: 'You know, black is one of those colors that just never goes out of style.'

Processing: 'It’s interesting how black is often associated with sophistication and elegance—think of a classic black tuxedo.'

Processing: 'In many cultures, black symbolizes mystery, power, and sometimes mourning.'

Processing: 'Black holes are some of the most mysterious objects in the universe—they’re literally regions where light can’t escape.'

Processing: 'When it comes to fashion, black is the go-to color for a timeless, sleek look.'

Processing: 'Isn’t it cool how black objects absorb more heat because they absorb all wavelengths of light?'

Processing: 'Matte black has such a modern, bold aesthetic—it’s so popular in car designs these days.'

Processing: 'Black is often used in branding when companies want to convey authority and reliability.'

Processing: 'Ever no

In [227]:
# Load Dictionary Learning AutoEncoder
ae = AutoEncoder.from_pretrained(
    "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", 
    map_location=torch.device('cpu')
)

  state_dict = t.load(path, map_location=map_location)


In [186]:
# Convert activations to sparse representations
sparse_representations = []
for activations in individual_activations:
    sparse_repr = ae.encode(activations).detach().cpu().numpy()  # (seq_len, dict_size)
    sparse_representations.append(sparse_repr)
print(f"Processed {len(sparse_representations)} sentences into token-aligned sparse representations.")

Processed 20 sentences into token-aligned sparse representations.


In [187]:
# Aggregate features: Find top activated features across all tokens in all sentences
feature_counts = {}
for sentence_features in sparse_representations:
    for token_features in sentence_features:
        top_indices = np.argsort(token_features)[-800:][::-1]  # Top 800 features per token
        for idx in top_indices:
            feature_counts[idx] = feature_counts.get(idx, 0) + 1
            

In [188]:
feature_counts

{20383: 39,
 21462: 25,
 7082: 14,
 18192: 31,
 9123: 9,
 9109: 10,
 31054: 8,
 9312: 39,
 9081: 12,
 31330: 24,
 18068: 3,
 26245: 4,
 29597: 13,
 3054: 26,
 18413: 4,
 15852: 19,
 15559: 7,
 22072: 4,
 19443: 5,
 11928: 13,
 2083: 16,
 30233: 2,
 23882: 15,
 19206: 15,
 16219: 7,
 8305: 9,
 22566: 14,
 10887: 377,
 26654: 26,
 18194: 18,
 1176: 38,
 22463: 18,
 15500: 17,
 20890: 19,
 3112: 8,
 29932: 23,
 24979: 1,
 28308: 12,
 1427: 11,
 24646: 12,
 14179: 52,
 24194: 17,
 16022: 25,
 19007: 14,
 27926: 29,
 3778: 8,
 27215: 12,
 22922: 30,
 23812: 8,
 19171: 27,
 26631: 28,
 24408: 14,
 18719: 5,
 32311: 37,
 24440: 9,
 23028: 7,
 22800: 14,
 26287: 27,
 7132: 20,
 25962: 38,
 22593: 7,
 5798: 29,
 7793: 11,
 6969: 13,
 20369: 9,
 7834: 14,
 27226: 12,
 12385: 12,
 9785: 5,
 578: 22,
 31967: 18,
 28529: 26,
 13450: 7,
 8071: 4,
 7180: 3,
 18268: 14,
 12077: 17,
 20977: 7,
 20654: 24,
 3858: 17,
 11309: 19,
 14034: 15,
 26162: 1,
 6023: 26,
 24777: 17,
 8683: 14,
 11233: 63,
 29649

In [189]:
# # Analyze which features activate for specific tokens
# top_n = 800  
# top_features = []

# for sentence_idx, per_token_features in enumerate(sparse_representations):
#     sentence_top_features = []
    
#     for token_idx, features in enumerate(per_token_features):
#         # Extract top N active features for this token
#         top_indices = np.argsort(features)[-top_n:][::-1]
#         sentence_top_features.append(set(top_indices))
    
#     top_features.append(sentence_top_features)  # Store per-token top feature indices

# # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[0])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features

#     # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[1])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features
    


In [190]:
# Select the most frequently occurring features
top_common_features = sorted(feature_counts, key=feature_counts.get, reverse=True)[:800]

# Create a synthetic sparse vector using these common features
synthetic_sparse_vector = np.zeros((32768,))  # Assume dictionary size is 32768
for idx in top_common_features:
    synthetic_sparse_vector[idx] = 1  # Set these features as active

In [191]:
# Decode sparse vector back into model space
synthetic_dense_vector = ae.decode(torch.tensor(synthetic_sparse_vector).float()).detach().cpu()
synthetic_dense_vector *= 10  # Experiment with scaling


In [219]:
# Add a new special token

masked_sentence = "Do you prefer the color red or blue? I prefer the color"
input_ids = tokenizer(masked_sentence, return_tensors="pt")["input_ids"]

# Convert token IDs to tokens
decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(f"Tokenized input: {decoded_tokens}")  # Debugging


Tokenized input: ['Do', 'Ġyou', 'Ġprefer', 'Ġthe', 'Ġcolor', 'Ġred', 'Ġor', 'Ġblue', '?', 'ĠI', 'Ġprefer', 'Ġthe', 'Ġcolor']


In [220]:

# Convert input_ids to embeddings
model_inputs = model.get_input_embeddings()(input_ids)

# Inject synthetic feature vector at the placeholder position
# model_inputs[:, placeholder_index, :] = synthetic_dense_vector
# Even without injection returns a filler word; e.g., "the"

# Generate text from modified embeddings
with torch.no_grad():
    outputs = model(inputs_embeds=model_inputs)
    logits = outputs.logits[:, -1, :]  # Get last token logits
    logs, tokens = torch.topk(logits, 10,dim=-1)

print(tokens)

# Decode the predicted token
predicted_words = tokenizer.batch_decode([token.item() for token in tokens[0]])

predicted_words

tensor([[ 4797,  2502,  4759,   273, 13735,  8862, 14863,  3168,  2806, 11978]])


[' blue',
 ' red',
 ' green',
 ' of',
 ' orange',
 ' yellow',
 ' pink',
 ' white',
 ' black',
 ' gray']