In [None]:
import sys 

sys.path.append("..")

from sae_auto_interp.explainers.cot.cot import ChainOfThought, ExplainerInput
from sae_auto_interp.clients import get_client

from transformers import AutoTokenizer

# Single source for the checkpoint id: the generation client and the tokenizer
# are both built from it, so the two cannot drift apart.
MODEL_ID = "astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit"

# Generation client plus the chain-of-thought explainer that wraps it.
client = get_client("local", MODEL_ID)
explainer = ChainOfThought(client)

# Tokenizer for the same checkpoint; used later to render chat prompts.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
prompt = [
    {
        "role" : "user",
        "content" : "List the names of cities around the world. please return your response in json."
    }
]


value = await client.async_generate(
    prompt,
    # extra_body=dict(guided_json=json_template)
    response_format={'type': 'json_object'}
)

In [89]:
# Chat-format message list holding a single user request.
prompt = [{"role": "user", "content": "Think two math word problems and their response."}]

# Render the chat template to text only (tokenize=False), ending with the
# generation prompt so the model continues as the assistant.
# Despite its name, `tokenized_prompt` is therefore not a list of token ids.
tokenized_prompt = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    tokenize=False,
)

In [92]:
import requests
from typing import Dict
from pydantic import BaseModel

# Schema with two required integer fields; its JSON schema is sent as the
# "schema" entry of the request body built below.
# Documented with `#` comments on purpose: pydantic copies a model's docstring
# into the generated JSON schema ("description"), which would change the
# payload sent to the server.
class ExampleResponseDynamic(BaseModel):
    question_1: int
    question_2: int

# Generation endpoint on the local machine.
url = "http://127.0.0.1:8000/generate"

# Upper bound (seconds) on how long to wait for the server. Without an
# explicit timeout `requests` waits indefinitely and the cell can hang.
REQUEST_TIMEOUT_S = 120

# Request body: the rendered chat prompt, the JSON schema the output should
# follow, and a cap on the number of generated tokens.
data = {
    "prompt": tokenized_prompt,
    "schema": ExampleResponseDynamic.model_json_schema(),
    "max_tokens": 50,
}

response = requests.post(url, json=data, timeout=REQUEST_TIMEOUT_S)

# Fail in this cell, with the status code and body, instead of letting a later
# cell trip over an error response when it decodes the JSON.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to get response. Status code: {response.status_code}\n"
        f"Response text: {response.text}"
    )

In [93]:
# Decode the body of the HTTP response as JSON and display it.
response.json()

{'text': ['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nThink two math word problems and their response.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{ "question_1": 5, "question_2": 2}']}

In [95]:
from pydantic import BaseModel
from typing import Dict

# Variant schema: a single "responses" mapping from string keys to integers.
# NOTE(review): this reuses the name ExampleResponseDynamic, so once this cell
# runs it replaces the two-field model defined earlier in the notebook. Any
# cell re-run afterwards (e.g. the one that builds the request body from
# ExampleResponseDynamic.model_json_schema()) picks up this definition.
# Consider giving it a distinct name.
class ExampleResponseDynamic(BaseModel):
    responses: Dict[str, int]

# Sample payload: five example ids, each mapped to a 0/1 integer.
response_data_dynamic = dict(
    responses=dict(
        example_1=1,
        example_2=0,
        example_3=1,
        example_4=1,
        example_5=1,
    )
)

# Build (and thereby validate) a model instance from the payload.
example_response_dynamic = ExampleResponseDynamic(
    responses=response_data_dynamic["responses"]
)

In [97]:
from pydantic import BaseModel, create_model

def create_dynamic_model(num_questions):
    """Build a pydantic model class with one integer field per question.

    Fields are named ``question_1`` through ``question_<num_questions>``.
    Each one is declared as ``(int, ...)``, i.e. an integer with no default.

    Args:
        num_questions: Number of ``question_<i>`` fields to generate. Values
            below 1 produce a model with no fields.

    Returns:
        The class returned by ``create_model``, named ``"DynamicQuestionModel"``.
    """
    fields = {}
    for index in range(1, num_questions + 1):
        # Ellipsis in the default slot means "no default value".
        fields[f"question_{index}"] = (int, ...)
    return create_model("DynamicQuestionModel", **fields)

# Build a model class with five integer fields, question_1 through question_5.
DynamicQuestionModel = create_dynamic_model(5)

In [98]:
# Display the JSON schema pydantic generates for the dynamically built model.
DynamicQuestionModel.model_json_schema()

{'properties': {'question_1': {'title': 'Question 1', 'type': 'integer'},
  'question_2': {'title': 'Question 2', 'type': 'integer'},
  'question_3': {'title': 'Question 3', 'type': 'integer'},
  'question_4': {'title': 'Question 4', 'type': 'integer'},
  'question_5': {'title': 'Question 5', 'type': 'integer'}},
 'required': ['question_1',
  'question_2',
  'question_3',
  'question_4',
  'question_5'],
 'title': 'DynamicQuestionModel',
 'type': 'object'}

In [2]:
from sae_auto_interp.autoencoders.ae import load_autoencoders
from sae_auto_interp.features.cache import FeatureCache


from nnsight import LanguageModel

import os

# GPT-2 wrapped by nnsight, dispatched immediately with automatic device placement.
model = LanguageModel("openai-community/gpt2", device_map="auto", dispatch=True)

# Autoencoder location. The original cell hard-coded one user's absolute path;
# it stays the default so existing runs behave the same, but can now be
# redirected with the SAE_AUTOENCODER_DIR environment variable.
autoencoder_dir = os.environ.get(
    "SAE_AUTOENCODER_DIR",
    "/share/u/caden/sae-auto-interp/sae_auto_interp/autoencoders/oai/gpt2",
)

# Returns three values; going by their names, presumably the autoencoders, the
# submodules they attach to, and the edits to apply. Confirm against
# load_autoencoders itself.
ae_dict, submodule_dict, edits = load_autoencoders(model, autoencoder_dir)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformer_lens import utils
from datasets import load_dataset
from nnsight import LanguageModel 

# Corpus: the "train" split of stas/openwebtext-10k.
data = load_dataset("stas/openwebtext-10k", split="train")

# Tokenize with the model's own tokenizer (max_length=64), shuffle with the
# fixed argument 22, and keep only the "tokens" column.
tokens = (
    utils.tokenize_and_concatenate(data, model.tokenizer, max_length=64)
    .shuffle(22)["tokens"]
)

In [4]:

from sae_auto_interp.features.features import feature_loader, Feature

import os

# Location passed to feature_loader (presumably the directory of saved feature
# records; confirm). The original hard-coded absolute path stays the default,
# but can be overridden with the SAE_FEATURES_DIR environment variable.
features_path = os.environ.get(
    "SAE_FEATURES_DIR",
    "/share/u/caden/sae-auto-interp/saved_records",
)
layer_index = 6
feature_index = 13452

# The single feature to inspect, identified by layer and index.
feature = Feature(layer_index, feature_index)

# Only the first (ae, records) pair is needed; later cells read `records`.
for ae, records in feature_loader(tokens, [feature], model, ae_dict, features_path):
    print(records)
    break
else:
    # Without this, an empty loader would leave `records` undefined, or stale
    # from an earlier run, and later cells would silently use the wrong data.
    raise RuntimeError(
        f"feature_loader yielded nothing for layer {layer_index}, "
        f"feature {feature_index} (features_path={features_path!r})"
    )

[<sae_auto_interp.features.features.FeatureRecord object at 0x7f143922e650>]


In [5]:
# Peek at the text of the first example of the first loaded record.
records[0].examples[0].text

' technical discipling of one era of the working class is the result of their struggle in the preceding era'

In [6]:
# Use the first record from the batch loaded above.
record = records[0]
# NOTE(review): this overwrites the record's top_logits in place with
# placeholder strings, so the explainer receives them rather than real logits.
# Presumably a stand-in while real values are unavailable. Confirm, and replace
# with real, neutral values before relying on the explanation.
record.top_logits = ["penis", "PENIS", "penis", "PENISPENIS"]

# ExplainerInput takes the record's examples and the record itself, both
# positionally; the parameter names are not visible in this notebook.
explainer_in = ExplainerInput(
    record.examples,
    record,
)

# The explainer is awaited directly, relying on the notebook's top-level await.
value = await explainer(explainer_in)

In [10]:
# Display the `explanation` attribute of `value`.
# NOTE(review): `value` is also assigned by the client.async_generate cell, so
# which object this shows depends on which of the two cells ran last.
value.explanation

'The token "er" at the end of a comparative adjective describing size.\n\n**Part 1**\n\nStep 1:\nThe activating tokens are mostly "er".\nThe previous tokens are mostly adjectives, or parts of adjectives, describing size.\nThe next tokens have nothing in common.\nThe neuron seems to activate on, or near, the token "er" in comparative adjectives describing size.\n\nStep 2:\n- In each example, the activating token was "er" appearing at the end of a comparative adjective.\n- The comparative adjectives ("wider", "taller", "smaller", "deeper") all describe size.\n\nLet me look again for patterns in the examples. Are there any links or hidden linguistic commonalities that I missed? I can\'t see any.\n\n**Part 2**\n\nStep 3:\nSIMILAR TOKENS: None.\nThe top logits list contains mostly unrelated nouns and adverbs.\n\nStep 4:\n[EXPLANATION]: The token "er" at the end of a comparative adjective describing size.\n\n**Part 1**\n\nStep 1:\nThe activating tokens are all things that one can be in.\nThe