In [3]:
import random

import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from dictionary import AutoEncoder
from datasets import load_dataset



In [2]:
# Get 1000 sentences from the pile. 

# Load The Pile in streaming mode
dataset = load_dataset("monology/pile-uncopyrighted", split="train", streaming=True)

# Collect 1,000 random sentences
random_sentences = []
for i, example in enumerate(dataset):
    random_sentences.append(example["text"])
    if len(random_sentences) >= 100:
        break

# Save or process sentences
with open("random_sentences.txt", "w") as f:
    for sentence in random_sentences:
        f.write(sentence + "\n")


In [4]:
# Load the Pythia model and tokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer.pad_token = tokenizer.eos_token

In [6]:
activation_list = []

def hook_fn(module, input, output):
    """Hook function to capture activations from the 4th MLP layer."""
    activation_list.append(output)

# Hook 4th MLP layer (index 3)
layer_to_hook = model.gpt_neox.layers[4].mlp
hook = layer_to_hook.register_forward_hook(hook_fn)


In [7]:
# read from random_sentences.txt
random_sentences = []
with open('random_sentences.txt', 'r') as f:
    lines = f.readlines()
    for i, line in enumerate(lines):
        if i > 1000:
            break
        random_sentences.append(line)
        
print(random_sentences[200])
    

            <description>Immunization Registry Status</description>



In [8]:
print(random_sentences[0])

It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.



In [9]:
# Store per-token activations
individual_activations = [] 
    
for i, sentence in enumerate(random_sentences):
    input_ids_batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    model(**input_ids_batch)  # Forward pass to capture activations

    if activation_list:
        # We are saving the seq_len, hidden_dim amount of activations. which is cool. 
        activations = activation_list[-1].squeeze(0)  # Shape: (seq_len, hidden_dim)
        individual_activations.append(activations)
    activation_list.clear()

print(f"Captured activations for {len(individual_activations)} sentences.")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Captured activations for 1001 sentences.


In [10]:
# Load Dictionary Learning AutoEncoder
ae = AutoEncoder.from_pretrained(
    "dictionaries/pythia-70m-deduped/mlp_out_layer4/10_32768/ae.pt",
    # Let torch automatially choose device based on asvail.
    # map_location=torch.device('cpu')
)



In [11]:
# Convert activations to sparse representations
sparse_representations = []
for activations in individual_activations:
    sparse_repr = ae.encode(activations).detach().cpu().numpy()  # (seq_len, dict_size)
    sparse_representations.append(sparse_repr)
print(f"Processed {len(sparse_representations)} sentences into token-aligned sparse representations.")

Processed 1001 sentences into token-aligned sparse representations.


In [12]:
dict_size = len(sparse_representations[0][0]) 
print(dict_size)

32768


In [None]:
# Aggregate features: Find top activated features across all tokens in all sentences
vals = [0 for i in range(dict_size)]
num = [0 for i in range(dict_size)]
context = [[] for i in range(dict_size)]
for i, sentence_features in enumerate(sparse_representations):
    for j, token_features in enumerate(sentence_features):
        # Can features be negative???
        # Does this 800 make sense anymore?
        top_indices = np.argsort(token_features)[-2000:][::-1]  # Top 800 features per token
        # Aggregate all the features across a sentence. Do set intersection afterwards so that only feature common across all the sentences are considered. We'll also maintain the average value for all the seen features. then, we'll get the decoded output with these valeus. Then, we'll scale those weights alone by 10x. using  this new model, we'll do text generation and see if the feature common across all sentences shows up a lot, which would confirm our hypothesis. 
        # for idx in top_indices:
        #     feature_counts[idx] = feature_counts.get(idx, 0) + 1
        # Aggregate features across sentences. 
        for idx in top_indices:
            num[idx] += 1
            vals[idx] += token_features[idx]
            context[idx].append((i, j))
    
    
            

In [12]:
# Get the mean vals
mean = [val / num if num > 0 else 0 for val, num in zip(vals, num)]
print(len(mean))
print(mean[:10])

32768
[0, 0, 0, 0, np.float32(0.1258342), np.float32(0.14910175), 0, 0, 0, np.float32(0.076006316)]


In [15]:
# Filter: only consider indices that happen more than 5 times. 
lots_samples = [[idx, num, mean[idx]] for idx, num in enumerate(num) if num > 5]
print(len(lots_samples))

9503


In [16]:
# Sort mean of those indices.
lots_samples.sort(key=lambda x: x[2], reverse=True)
print(len(lots_samples))
print(lots_samples[:10])

9503
[[20383, 2193, np.float32(12.272572)], [18192, 1753, np.float32(5.0215034)], [25721, 110, np.float32(2.8978717)], [22113, 126, np.float32(2.6947615)], [1186, 102, np.float32(2.6079156)], [6122, 27, np.float32(2.577307)], [32061, 28, np.float32(2.443505)], [26654, 1558, np.float32(2.418172)], [25956, 32, np.float32(2.303011)], [6065, 122, np.float32(2.105288)]]


In [38]:
# Get the context for the indices.
sig = [(context[idx], idx, num, mean) for idx, num, mean in lots_samples]
print(len(sig))
print(sig[0])

9503
([(0, 0), (0, 6), (1, 0), (2, 0), (2, 12), (3, 0), (4, 0), (4, 2), (5, 0), (6, 0), (6, 15), (7, 0), (8, 0), (8, 1), (8, 42), (8, 43), (9, 0), (10, 0), (10, 23), (11, 0), (12, 0), (12, 13), (12, 72), (12, 77), (12, 86), (13, 0), (14, 0), (14, 27), (15, 0), (16, 0), (16, 16), (16, 17), (16, 32), (17, 0), (18, 0), (18, 36), (19, 0), (20, 0), (20, 29), (20, 30), (21, 0), (22, 0), (22, 19), (22, 38), (23, 0), (24, 0), (24, 27), (25, 0), (26, 0), (26, 22), (26, 62), (27, 0), (28, 0), (28, 15), (29, 0), (30, 0), (30, 8), (30, 21), (30, 28), (31, 0), (32, 0), (32, 10), (33, 0), (34, 0), (34, 26), (35, 0), (36, 0), (36, 20), (36, 32), (36, 73), (37, 0), (38, 0), (38, 10), (39, 0), (40, 0), (40, 18), (41, 0), (42, 0), (42, 13), (43, 0), (44, 0), (44, 1), (45, 0), (46, 0), (46, 9), (46, 10), (47, 0), (48, 0), (48, 1), (48, 18), (48, 71), (49, 0), (50, 0), (50, 29), (50, 46), (51, 0), (52, 0), (52, 16), (52, 17), (52, 77), (52, 107), (53, 0), (54, 0), (54, 1), (55, 0), (56, 0), (56, 28), (57,

In [40]:
# Leets sort by len of the sig. Secondary key will be the mean value. 
sig = sorted(sig, key = lambda x: (len(x[0]), -x[3]))

In [82]:
# Filter for only stuff where mean > 1.5
sig = [x for x in sig if x[3] > 1.5]
print(len(sig))
print(sig[0])

20
([(111, 3), (275, 20), (329, 6), (382, 142), (582, 19), (582, 21)], 21821, 6, np.float32(2.0713027))


In [89]:
context, idx, num, mean = sig[0]
print(len(context))
print(idx)
print(mean)

37
29401
1.6737142


In [90]:
# Print those tokens out. 

for i, j in context:# Tokenize the sentece. 
    # Print the j'th token
    print(i, j) 
    print(random_sentences[i])
    tokenized_sentence = tokenizer(random_sentences[i])['input_ids']
    #j'th token
    print(tokenizer.convert_ids_to_tokens(tokenized_sentence[j]))

0 38
It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web. Playing on the web works, but you have to simulate multi-touch for table moving and that can be a bit confusing.

Ġmulti
26 6
Conversations with my inspiring co-worker Roushey (who also created the “Mechanical Underdogs” signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist – by evolving from a normal dinner table.

Ġco
30 44
Your plate can spawn little pieces of pasta. You do so by “ordering” them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying costs, which are debited from your credits (you start with a number of credits).

Ġdeb
82 1
Other non-obvious features had to be dropped, too. Fo

Layer 3  \
Index. My interpretation. Mean value  \
21821. Month. 2.07150  \
6122. Hyphen in Hyphenated words. 2.58  \
32061. Parenthesis. 2.44  \
25956. Want to say abbreviations/captilization. 2.30  \
29401. Prefixes like un, non, re. 1.67  \


In [104]:
# While generating, I need to get the activations from the model. Send it to the sae with the weight of an index clamped to 5. Then reinject those back into the model. I will look at Colin's script for this. 

In [6]:
from inference import Pythia70Model

# Use Colin's code to modify inference. 
seventy_m_model = Pythia70Model()
seventy_m_model.clamping("We took a", "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", do_clamping=False, clamping_layer=3, clamping_index=6122, clamping_value=5.0)

['We took a look at the two-']


In [5]:
from inference import Pythia70Model

# Use Colin's code to modify inference. 
seventy_m_model = Pythia70Model()
seventy_m_model.clamping("We took a", "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", do_clamping=True, clamping_layer=3, clamping_index=6122, clamping_value=5.0)

['We took a--- rather-']


In [2]:
from inference import Pythia70Model

# Use Colin's code to modify inference. 
seventy_m_model = Pythia70Model()
seventy_m_model.clamping("The leaves change colour and fall from the trees in", "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", do_clamping=True, clamping_layer=3, clamping_index=6122, clamping_value=5.0)

Next token tensor(253)
Next token tensor(187)
Next token tensor(2366)
Next token tensor(15)
Next token tensor(187)
['The leaves change colour and fall from the trees in the\n together.\n']


In [1]:
# Outputs before my stuff
test_input = "I'm feeling a bit" 

# Tokenize the input
inputs = tokenizer(test_input, return_tensors="pt", padding="max_length", truncation = True)
attention_mask = inputs['attention_mask']  # Get the attention_mask
with torch.no_grad():
    output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Decode the output
decoded_output = tokenizer.decode(output[0].tolist(), skip_special_tokens=True)

# Print decoded output
print(decoded_output)


NameError: name 'tokenizer' is not defined

Talk to me. I'm not sure I'm going to be able to do that. I'm not sure I  \ 

The word of the day is the same as the word of the day.  \

The mental health of the patient is not the same as the physical health of the patient. The mental health of  \

0.67 is the highest value of 0.67  \

Every pilot needs a new set of skills to be used in the future.  \










In [26]:
acts_np = decoded_activations.detach().numpy()
# Arbitrary threshold
print(len(acts_np[acts_np > 1]))
# non_zero_indices = np.nonzero(decoded_activations)
# print(non_zero_indices.shape)

35


In [28]:
acts_np_thresh = acts_np[acts_np > 1]
print(acts_np_thresh.shape)

(35,)


In [29]:
# Weight scaling


# What part of the decoded outputs are significant?
# Considering non-zero indices for now

# Scale all the indices by 10
# weight[acts_np_thresh] *= 5

# Add 5 bias to all the non-zero indices
bias[acts_np_thresh] += 5

# write back to the model
state_dict['mlp.dense_4h_to_h.weight'] = weight
state_dict['mlp.dense_4h_to_h.bias'] = bias

model.gpt_neox.layers[3].load_state_dict(state_dict)

<All keys matched successfully>

In [30]:
#  Test out hypothesis

# Tokenize the input

# Is there a better way to do this?
# Pass the tokenized input through the model
# Tokenize the input
test_input = "I like eating"
inputs = tokenizer(test_input, return_tensors="pt", padding="max_length", truncation = True)
attention_mask = inputs['attention_mask']  # Get the attention_mask
with torch.no_grad():
    output = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Decode the output
decoded_output = tokenizer.decode(output[0].tolist(), skip_special_tokens=True)

# Print decoded output
print(decoded_output)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


I like eating a lot of a long time to the dayly and to the dayly and to the dayly


In [92]:
# # Analyze which features activate for specific tokens
# top_n = 800  
# top_features = []

# for sentence_idx, per_token_features in enumerate(sparse_representations):
#     sentence_top_features = []
    
#     for token_idx, features in enumerate(per_token_features):
#         # Extract top N active features for this token
#         top_indices = np.argsort(features)[-top_n:][::-1]
#         sentence_top_features.append(set(top_indices))
    
#     top_features.append(sentence_top_features)  # Store per-token top feature indices

# # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[0])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features

#     # Example: Print feature activations for each token in the first sentence
# tokenized_sentence = tokenizer(sentences[1])['input_ids']
# decoded_tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

# print("\nFeature activations for the first sentence:")
# for token, feature_set in zip(decoded_tokens, top_features[0]):
#     print(f"Token: {token}, Top Features: {list(feature_set)[:10]}")  # Show top 5 features
    


In [18]:
# Select the most frequently occurring features
top_common_features = sorted(feature_counts, key=feature_counts.get, reverse=True)[:800]

# # Create a synthetic sparse vector using these common features
# synthetic_sparse_vector = np.zeros((32768,))  # Assume dictionary size is 32768
# for idx in top_common_features:
#     synthetic_sparse_vector[idx] = 1  # Set these features as active

In [19]:
print(top_common_features[:10])

[np.int64(10900), np.int64(10969), np.int64(10968), np.int64(10918), np.int64(10919), np.int64(10967), np.int64(10966), np.int64(10920), np.int64(10965), np.int64(10921)]


In [94]:
# Decode sparse vector back into model space
synthetic_dense_vector = ae.decode(torch.tensor(synthetic_sparse_vector).float()).detach().cpu()
synthetic_dense_vector *= 10  # Experiment with scaling


In [95]:
# Add a new special token
tokenizer.add_special_tokens({'additional_special_tokens': ["<XYZ>"]})
model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to include new token

masked_sentence = "The capital of <XYZ> is"
input_ids = tokenizer(masked_sentence, return_tensors="pt")["input_ids"]

# Convert token IDs to tokens
decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(f"Tokenized input: {decoded_tokens}")  # Debugging

# **Find the placeholder index**
try:
    placeholder_index = decoded_tokens.index("<XYZ>")
except ValueError:
    raise ValueError(f"Could not find placeholder token in: {decoded_tokens}")


Tokenized input: ['The', 'Ġcapital', 'Ġof', 'Ġ', '<XYZ>', 'Ġis']


In [82]:

# Convert input_ids to embeddings
model_inputs = model.get_input_embeddings()(input_ids)

# Inject synthetic feature vector at the placeholder position
model_inputs[:, placeholder_index, :] = synthetic_dense_vector

# Generate text from modified embeddings
with torch.no_grad():
    outputs = model(inputs_embeds=model_inputs)
    logits = outputs.logits[:, -1, :]  # Get last token logits
    predicted_token_id = torch.argmax(logits, dim=-1).item()

# Decode the predicted token
predicted_word = tokenizer.decode([predicted_token_id])

print(f"Predicted token: {predicted_word}")

Predicted token:  the
