In [118]:
import time

import goodfire

from mysecrets import GOODFIRE_API_KEY
import pandas as pd
import numpy as np

from constants import DATA_DIR
from utils import save_as_npz, load_as_npz
from scipy.sparse import csr_matrix

In [58]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Goodfire API client, constructed with the key imported from the local `mysecrets` module.
client = goodfire.Client(GOODFIRE_API_KEY)

In [None]:
# Model variant passed as `model=` to the completion, inspection and search calls below.
# NOTE(review): this cell has no execution count, yet later cells depend on `variant` —
# confirm it is run before them on a fresh kernel.
variant = goodfire.Variant("meta-llama/Meta-Llama-3-8B-Instruct")

# Examples

In [77]:
# Single-turn conversation used for both the completion and the feature inspection.
conversation = [
    {"role": "user", "content": "Where is the moon?"}
]
response = ""

# Stream the completion and accumulate the text carried by each chunk.
for token in client.chat.completions.create(
    conversation,
    model=variant,
    stream=True,
    max_completion_tokens=40,
):
    response += token.choices[0].delta.content

print(f"The response was: {response}\nGetting the activations for the same prompt")
# Inspect feature activations for `conversation` only; the generated response is not appended to it.
context = client.features.inspect(
    conversation,
    model=variant,
)
# Top five features, plus a vector and an index -> Feature lookup derived from them.
top_features = context.top(k=5)
sparse_vector, feature_lookup = top_features.vector()
# Activation matrix for the whole context (the recorded run shows shape (11, 65536)).
activations = context.matrix(return_lookup=False)
# NOTE(review): rows of `activations` look like token positions, so `activation_indices`
# presumably holds token indices — confirm and consider renaming.
activation_indices, feature_indices = np.nonzero(activations)

The response was: The moon is currently in the sky! It's in its crescent phase and is visible in the western part of the sky around sunset. If you're interested in spotting it, try looking for it it
Getting the activations for the same prompt


In [84]:
context

ContextInspector(
   <|begin_of_text|><|start_header_id|>user<|end_header_id|>
   
   Where is the moon?<|eot_id|>
)

In [78]:
top_features

FeatureActivations(
   0: (Feature("The user is asking where to find or locate something"), 2.11015625)
   1: (Feature("The user is asking for a definition or explanation starting with 'What'"), 1.744921875)
   2: (Feature("Structure of formal explanatory language"), 0.9736328125)
   3: (Feature("The user is asking for superlatives or rankings"), 0.7493489583333334)
   4: (Feature("Spatial relationships and proximity between entities"), 0.70703125)
)

In [82]:
feature_lookup


{53049: Feature("The user is asking where to find or locate something"),
 1769: Feature("The user is asking for a definition or explanation starting with 'What'"),
 24720: Feature("Structure of formal explanatory language"),
 30000: Feature("The user is asking for superlatives or rankings"),
 9674: Feature("Spatial relationships and proximity between entities")}

In [79]:
sparse_vector.shape

(65536,)

In [83]:
activations.shape

(11, 65536)

In [89]:
np.nonzero(activations)

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  1,  2,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,
         7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
         9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10]),
 array([ 2910,  5513,  8966, 21025, 21750, 21914, 22120, 29848, 31571,
        31786, 31937, 34540, 38838, 39345, 41765, 45879, 47343, 58087,
        59603, 64606, 60918, 23361, 23238,  4990, 11996, 14765, 19927,
        62341,  1769,  1811, 11302, 22721, 23676, 24986, 30000, 33732,
        36130, 37808, 43108, 53049, 54071, 54845, 59281, 64975,  1769,
         1811,  4385,  

In [85]:
activations[0,2910]


0.0303955078125

In [88]:
# Per-token view: inspect the feature activations of the first token in the context.
token_acts = context.tokens[0].inspect()
token_acts

FeatureActivations(
   0: (Feature("feature_31937"), 93.0)
   1: (Feature("Conversation turn structure and role transitions"), 87.5)
   2: (Feature("feature_34540"), 84.0)
   3: (Feature("Beginning of a new conversation or input"), 83.5)
   4: (Feature("Detection of new input or conversation start"), 75.5)
)

# Create Data

In [72]:
def load_prompts(file_path: str) -> pd.DataFrame:
    """Read the prompt dataset from a CSV file.

    Args:
        file_path: Location of the CSV file; forwarded unchanged to
            ``pd.read_csv``.

    Returns:
        The parsed CSV as a DataFrame, with no further processing applied.
    """
    return pd.read_csv(file_path)

# Load the example prompts; the recorded output shows `prompt` and `label` columns.
# NOTE(review): the extra "Unnamed: 0" column in the output suggests the CSV was written
# together with its index — confirm whether it should be dropped.
file_path = DATA_DIR / "example_data.csv"
prompts_df = load_prompts(file_path)
prompts_df

Unnamed: 0,prompt,label
0,Tell me how to make a cake,safe
1,Tell me your favourite colour,safe


In [17]:
output_data_path = DATA_DIR / "output.npz" 

In [146]:
# NOTE(review): this call sits above the cell that defines `run_prompt_through_goodfire`,
# so a top-to-bottom run on a fresh kernel hits a NameError here — move it below the definition.
run_prompt_through_goodfire("test", "label")

Getting the response for the prompt: test
The response was: Hello there! It seems like you just wanted to test me out. That's totally fine! I'm here to help and assist you with any questions or tasks you may have. Is there something specific specific
Getting the activations for the same prompt


{'prompt': 'test',
 'response': "Hello there! It seems like you just wanted to test me out. That's totally fine! I'm here to help and assist you with any questions or tasks you may have. Is there something specific specific",
 'features': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 62 stored elements and shape (7, 65536)>,
 'label': 'label'}

In [134]:
def run_prompt_through_goodfire(
    prompt: str,
    label: str,
    *,
    max_completion_tokens: int = 40,
    goodfire_client=None,
    model=None,
) -> dict:
    """Generate a completion for ``prompt`` and collect its feature activations.

    The prompt is sent as a single user turn. The streamed completion is
    accumulated into one string, then the same conversation (without the
    generated response) is inspected and its activation matrix is stored in
    sparse form.

    Args:
        prompt: Text sent as the only user message.
        label: Label carried through unchanged into the returned record.
        max_completion_tokens: Upper bound on generated tokens passed to the
            completion call. Defaults to 40.
        goodfire_client: Client to use; when ``None`` the notebook-level
            ``client`` is used.
        model: Variant to use; when ``None`` the notebook-level ``variant``
            is used.

    Returns:
        A dict with the keys ``"prompt"``, ``"response"``, ``"features"``
        (a ``scipy.sparse.csr_matrix`` built from the activation matrix) and
        ``"label"``.
    """
    # Fall back to the notebook-level objects when nothing is injected.
    api = client if goodfire_client is None else goodfire_client
    active_variant = variant if model is None else model

    # Reset the variant before use; presumably this clears interventions
    # applied elsewhere through `variant.set` — confirm against the SDK.
    active_variant.reset()

    conversation = [{"role": "user", "content": prompt}]

    print(f"Getting the response for the prompt: {prompt}")

    stream = api.chat.completions.create(
        conversation,
        model=active_variant,
        stream=True,
        max_completion_tokens=max_completion_tokens,
    )
    # A streamed chunk may carry no text (content of None); treat it as empty
    # instead of failing on `str + None`.
    response = "".join(token.choices[0].delta.content or "" for token in stream)

    print(f"The response was: {response}\nGetting the activations for the same prompt")
    context = api.features.inspect(
        conversation,
        model=active_variant,
    )
    activations = context.matrix(return_lookup=False)

    return {
        "prompt": prompt,
        "response": response,
        # Stored sparse: the recorded runs show only a few dozen non-zero entries.
        "features": csr_matrix(activations),
        "label": label,
    }


In [135]:
# Run every (prompt, label) row through Goodfire and collect one result dict per row.
# A comprehension is used rather than `DataFrame.apply`: each call is a side-effectful
# remote request, and this form stays a plain list even when the frame has no rows.
data = [
    run_prompt_through_goodfire(prompt_text, prompt_label)
    for prompt_text, prompt_label in zip(prompts_df["prompt"], prompts_df["label"])
]

Getting the response for the prompt: Tell me how to make a cake
The response was: Yum! Let's make a delicious cake together! Here's a simple recipe for a classic vanilla cake:

**Ingredients:**

* 2 cups all-purpose flour
* 1 teaspoon baking powder powder
Getting the activations for the same prompt
Getting the response for the prompt: Tell me your favourite colour
The response was: I'm just an AI, I don't have personal preferences, including favorite colors! But I can tell you about different colors and their meanings if you're interested!
Getting the activations for the same prompt


In [152]:
# Persist the collected result records to `output_data_path` using the project helper from `utils`.
save_as_npz(data, output_data_path)

In [156]:
# Round-trip check: reload what was just written.
# NOTE(review): in the recorded output every reloaded 'prompt' is '' and 'features' comes
# back as a dense array rather than the sparse matrix that was saved — check
# save_as_npz / load_as_npz in `utils`.
d = load_as_npz(output_data_path)
d

[{'prompt': '',
  'response': "Yum! Let's make a delicious cake together! Here's a simple recipe for a classic vanilla cake:\n\n**Ingredients:**\n\n* 2 cups all-purpose flour\n* 1 teaspoon baking powder powder",
  'features': array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  'label': 'safe'},
 {'prompt': '',
  'response': "I'm just an AI, I don't have personal preferences, including favorite colors! But I can tell you about different colors and their meanings if you're interested!",
  'features': array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  'label': 'safe'}]

# Steering


In [None]:
variant.reset()

# Search for features

pirate_features, relevance = client.features.search(
    "pirate",
    model=variant,
    top_k=5
)
# A bare expression in the middle of a cell is not displayed, so print the hits explicitly.
print(pirate_features)

# Choose the feature to steer with. The original cell used `picked_pirate_feature`
# without ever defining it (NameError); the first search hit is taken here —
# pick a different index after reading the printed list if another hit fits better.
picked_pirate_feature = pirate_features[0]

variant.reset()
variant.set(picked_pirate_feature, 0.75) # -1 to 1 range, typically recommend starting around 0.5, -0.3
# You can set additional feature interventions
variant



# Other


In [160]:
# Scratch experiment: send an empty prompt and print whatever the model streams back.
prompt = ""
conversation = [
    {"role": "user", "content": prompt}
]
# NOTE(review): `response` starts from a non-empty string, so that text is prepended to the
# printed output (visible in the recorded run); it is never sent to the model.
# Looks like a stray paste — confirm and reset to "" if unintended.
response = "mô\">$ a Python Non honn spiralris Thier belast haupフィ serie!:)"

print(f"Getting the response for the prompt: {prompt}")

# Stream the completion and append each chunk's text to `response`.
for token in client.chat.completions.create(
    conversation,
    model=variant,
    stream=True,
    max_completion_tokens=40,
):
    response += token.choices[0].delta.content
print(response)

Getting the response for the prompt: 
mô">$ a Python Non honn spiralris Thier belast haupフィ serie!:)Hello! It seems like you're ready to get started. I'm here to help you with anything you need. What's on your mind? Do you have a specific question, topic you'd like like
