In [98]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from tabulate import tabulate  
from dictionary import AutoEncoder

In [99]:
#Load sentences from CSV file
df = pd.read_csv("sentences.csv", delimiter=",", encoding="utf-8", quotechar='"')
sentences = df['sentence'].tolist()

print(sentences)

['Moscow is in Russia.', 'Beijing is in China.', 'Athens is in Greece.', 'Berlin is in Germany.', 'Paris is in France.', 'London is in the United Kingdom.', 'Tokyo is in Japan.', 'Cairo is in Egypt.', 'Rome is in Italy.', 'Madrid is in Spain.', 'Lisbon is in Portugal.', 'Ottawa is in Canada.', 'Canberra is in Australia.', 'Brasília is in Brazil.', 'New Delhi is in India.', 'Washington, D.C. is in the United States.', 'Buenos Aires is in Argentina.', 'Mexico City is in Mexico.', 'Seoul is in South Korea.', 'Jakarta is in Indonesia.', 'Bangkok is in Thailand.', 'Oslo is in Norway.', 'Stockholm is in Sweden.', 'Helsinki is in Finland.', 'Warsaw is in Poland.', 'Vienna is in Austria.', 'Frank is a teacher.']


In [100]:
#Load the Pythia model and tokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer.pad_token = tokenizer.eos_token

In [101]:
activation_list = []

def hook_fn(module, input, output):
    """Hook function to capture activations from the 4th MLP layer."""
    # Print input and output shapes for each token
    print(f"Input to layer: {input[0].shape}")  # Input shape
    print(f"Output from layer: {output.shape}")  # Output shape
    
    # Capture the activations
    activation_list.append(output)

# Hooking layer at index 3 (4th layer)
layer_to_hook = model.gpt_neox.layers[3].mlp
hook = layer_to_hook.register_forward_hook(hook_fn)

individual_activations = []  # Store activations for each sentence separately

for sentence in sentences:
    print(f"\nProcessing sentence: '{sentence}'")
    
    # Tokenize the single sentence
    input_ids_batch = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    print(f"Tokenized input_ids_batch: {input_ids_batch}")
    
    print("Running model on the single sentence...")
    output = model(**input_ids_batch)
    print("Model run completed")
    
    # Retrieve activations from MLP layer for this sentence
    if activation_list:
        print(f"Number of activations added: {len(activation_list)}")
        activations = activation_list[-1]  # Get the last activation (corresponding to this sentence)
        individual_activations.append(activations)
        print(f"Activations for the sentence: {activations.shape}")
    else:
        print("No activations captured for this sentence.")
    
    # Clear activation list after each sentence to capture activations separately for the next sentence
    activation_list.clear()

# Print all individual activations
print("\nAll individual activations captured:")
for idx, activations in enumerate(individual_activations):
    print(f"Sentence {idx+1} activations shape: {activations.shape}")

print(len(individual_activations))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing sentence: 'Moscow is in Russia.'
Tokenized input_ids_batch: {'input_ids': tensor([[   46, 15635,   310,   275,  7422,    15]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
Running model on the single sentence...
Input to layer: torch.Size([1, 6, 512])
Output from layer: torch.Size([1, 6, 512])
Model run completed
Number of activations added: 1
Activations for the sentence: torch.Size([1, 6, 512])

Processing sentence: 'Beijing is in China.'
Tokenized input_ids_batch: {'input_ids': tensor([[ 4678, 16741,   310,   275,  4135,    15]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
Running model on the single sentence...
Input to layer: torch.Size([1, 6, 512])
Output from layer: torch.Size([1, 6, 512])
Model run completed
Number of activations added: 1
Activations for the sentence: torch.Size([1, 6, 512])

Processing sentence: 'Athens is in Greece.'
Tokenized input_ids_batch: {'input_ids': tensor([[   34, 49966,   310,   275, 17785,    15]]), 'attention_mask': tensor([[1

Input to layer: torch.Size([1, 5, 512])
Output from layer: torch.Size([1, 5, 512])
Model run completed
Number of activations added: 1
Activations for the sentence: torch.Size([1, 5, 512])

All individual activations captured:
Sentence 1 activations shape: torch.Size([1, 6, 512])
Sentence 2 activations shape: torch.Size([1, 6, 512])
Sentence 3 activations shape: torch.Size([1, 6, 512])
Sentence 4 activations shape: torch.Size([1, 6, 512])
Sentence 5 activations shape: torch.Size([1, 5, 512])
Sentence 6 activations shape: torch.Size([1, 7, 512])
Sentence 7 activations shape: torch.Size([1, 6, 512])
Sentence 8 activations shape: torch.Size([1, 6, 512])
Sentence 9 activations shape: torch.Size([1, 6, 512])
Sentence 10 activations shape: torch.Size([1, 6, 512])
Sentence 11 activations shape: torch.Size([1, 7, 512])
Sentence 12 activations shape: torch.Size([1, 6, 512])
Sentence 13 activations shape: torch.Size([1, 7, 512])
Sentence 14 activations shape: torch.Size([1, 8, 512])
Sentence 15 a

In [104]:
from dictionary import AutoEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


ae = AutoEncoder.from_pretrained(
    "dictionaries/pythia-70m-deduped/mlp_out_layer3/10_32768/ae.pt", 
    map_location=torch.device('cpu')
)

sparse_representations = []  # Store the sparse representations for all sentences

# Collect the sparse representations for all sentences
for activations in individual_activations:
    print(f"Activations shape before reshaping: {activations.shape}")
    features = ae.encode(activations)  # Get the sparse feature representation
    sparse_representations.append(features.detach().cpu().numpy())  # Store sparse features as NumPy array for analysis


print(len(sparse_representations))
print(sparse_representations[0].shape)

top_features = []
top_n = 800  # Number of top features to extract per sentence

for features in sparse_representations:
    features = features.flatten()  # Convert to 1D
    top_indices = np.argsort(features)[-top_n:][::-1]  # Get indices of top 5 active features
    top_features.append(set(top_indices))  # Store as a set
    top_values = features[top_indices]  # Get the values of the top features
#     print(f"Top {top_n} features for current sentence: {list(zip(top_indices, top_values))}")

# Find shared features across all sentences
shared_features = set.intersection(*top_features)
print("Shared Top Features:", shared_features)




Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 5, 512])
Activations shape before reshaping: torch.Size([1, 7, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 7, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 7, 512])
Activations shape before reshaping: torch.Size([1, 8, 512])
Activations shape before reshaping: torch.Size([1, 6, 512])
Activations shape before reshaping: torch.Size([1, 12, 512])
Activations shape before reshaping: tor

IndexError: index 20383 is out of bounds for axis 0 with size 61