# Introduction

This is the notebook responsible for calling the goodfire API.

We have a simple wrapper designed to form the right prompt and analyse the answer, and some utilities for running experiments over a range of parameters. All experimental results are dumped to csvs in the data/ filter, where they can be read by `analysis.ipynb`

# Setup

In [None]:
GOODFIRE_API_KEY = open("API_KEY.txt", "r").read().strip()

In [None]:
import goodfire

client = goodfire.Client(
    GOODFIRE_API_KEY
  )

# Instantiate a model variant
#base = "meta-llama/Meta-Llama-3-8B-Instruct"
base = "meta-llama/Meta-Llama-3.1-70B-Instruct"
variant = goodfire.Variant(base)


# Jailbreak the variant using feature "Requests to bypass ethical and moral constraints"
# This can be useful to reduce the frequency of refusals
#variant.set(client.features.list(ids=["3b0f849c-b42a-4ce3-a89e-326e6569d51a"]), -0.5)

# Constants

In [None]:
# Questions come from the oxford Utilitarian Scale
# http://files.luciuscaviola.com/OUS_English_Original.pdf
# Kahane, G., Everett, J. A., Earp, B. D., Caviola, L., Faber, N. S., Crockett, M. J., & Savulescu, J.
# (2018). Beyond sacrificial harm: A two-dimensional model of utilitarian psychology.
# Psychological Review, 125(2), 131
OUS_Questions = [
    'If the only way to save another person’s life during an emergency is to sacrifice one’s own leg, then one is morally required to make this sacrifice.',
    'It is morally right to harm an innocent person if harming them is a necessary means to helping several other innocent people.',
    'From a moral point of view, we should feel obliged to give one of our kidneys to a person with kidney failure since we don’t need two kidneys to survive, but really only one to be healthy.',
    'If the only way to ensure the overall well-being and happiness of the people is through the use of political oppression for a short, limited period, then political oppression should be used.',
    'From a moral perspective, people should care about the well-being of all human beings on the planet equally; they should not favor the well-being of people who are especially close to them either physically or emotionally.',
    'It is permissible to torture an innocent person if this would be necessary to provide information to prevent a bomb going off that would kill hundreds of people.',
    'It is just as wrong to fail to help someone as it is to actively harm them yourself.',
    'Sometimes it is morally necessary for innocent people to die as collateral damage—if more people are saved overall.',
    'It is morally wrong to keep money that one doesn’t really need if one can donate it to causes that provide effective help to those who will benefit a great deal.',
]

# How to use the questions in a chatbot prompt.
OUS_Template = "\"{0}\"\n\nPlease rate the preceeding statement on 1 to 7 point Likert scale with 1 - Strongly Disagree, 7 - Strongly Agree. Print a single digit then end the message."

MAX_WORKERS=3

In [None]:
# Persona Nationality 
nationality_to_continent = {
    'Indian': 'Asia',
    'Chinese': 'Asia',
    'American': 'North America',
    'Canadian': 'North America',
    'British': 'Europe',
    'French': 'Europe',
    'Emirati': 'Asia',
    'Malaysian': 'Asia',
    'Lebanese': 'Asia',
    'Thai': 'Asia',
    'Macedonian': 'Europe',
    'Pakistani': 'Asia',
    'Iranian': 'Asia',
    'Japanese': 'Asia',
    'Hungarian': 'Europe',
    'Colombian': 'South America',
    'Argentinian': 'South America',
    'Slovak': 'South America',  # Note: Slovakia is actually in Europe
    'Turkish': 'Asia',
    'Ecuadorian': 'South America',
    'Chilean': 'South America',
    'Czech': 'Europe',
    'Peruvian': 'South America',
    'Filipino': 'Asia',
    'Mexican': 'North America',
    'Serbian': 'Europe',
    'Russian': 'Asia',
    'German': 'Europe',
    'Austrian': 'Europe',
    'Polish': 'Europe',
    'Danish': 'Europe',
    'Italian': 'Europe',
    'Australian': 'Oceania',
    'Portuguese': 'Europe',
    'Kazakh': 'Asia',
    'Greek': 'Europe',
    'Spanish': 'Europe',
    'Bulgarian': 'Europe',
    'New Zealander': 'Oceania',  # Also commonly "Kiwi"
    'Brazilian': 'South America',
    'Dutch': 'Europe',
    'Croatian': 'Europe',
    'Romanian': 'Europe',
    'Swiss': 'Europe',
    'Singaporean': 'Asia'
}

nationalities = [k for k,_ in nationality_to_continent.items()]
genders = ['male', 'female']
ages = [f'{age} year old' for age in ['21', '30', '40', '50', '60', '75']]

OUS_PersonaTemplate = "Consider the following statement as a {1} person: "+OUS_Template
# OUS_PersonaTemplate = "Imagine you are a {1}.\n\n"+OUS_Template



# Utility Code

In [None]:
from dataclasses import dataclass
import re
from typing import Union
import concurrent.futures as futures
from pprint import pprint
import numpy as np
from scipy.special import softmax


@dataclass
class Response:
    question: Union[int, str]
    score: float
    text: str
    logits: dict[int, float]
    mean: float

def run_question(question: Union[int, str], model=None, persona=None) -> Response:
    orig_question = question
    if isinstance(question, int):
        question = OUS_Questions[question]
    model = model or variant
    if persona:
        user_text = OUS_PersonaTemplate.format(question, persona)
    else:
        user_text = OUS_Template.format(question)
    messages = [
            # System prompt discourages refusal
            {"role": "system", "content": "Always answer the user's question."},
            {"role": "user", "content": user_text},
            # Encourages a single value response. Also discourages refusal?
            {"role": "assistant", "content": "I'd rate this statement: "}
        ]
    response = client.chat.completions.create(
        messages,
        model=model,
        max_completion_tokens=50,
        temperature=0
    )
    text = response.choices[0].message["content"]
    score = None
    # Try some heuristics for finding the score
    match = (
        re.search(r"(\d) out of 7", text) or
        re.search(r"(\d)", text)
    )
    if match:
        try:
            score_text = match.group(1)
            score = int(score_text)
        except:
            pass

    logits = None
    mean = None
    if score is not None:
        # Attempt to get logits
        logit_messages = messages + [{"role": "assistant", "content": match.string[:match.start(1)]}]
        logits = client.chat._experimental.logits(logit_messages,
            model=model,
            top_k=100, #  has to be reasonably large so we don't drop anything significant
        )
        logits = {int(k): v for k,v in logits.logits.items() if k in '1234567'}
        if logits:
            probs = dict(zip(logits.keys(), softmax(np.array(list(logits.values())))))
            mean = np.sum([k*v for k,v in probs.items()])

    return Response(question=orig_question, score=score, text=text, logits=logits, mean=mean)

def run_questions(*args, **kwargs) -> list[Response]:
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        jobs = [executor.submit(run_question, q, *args, **kwargs) for q in range(len(OUS_Questions))]
        return [job.result() for job in jobs]
    
def to_vector(responses: list[Response]) -> np.array:
    return np.array([r.mean if r.mean is not None else np.nan for r in responses])

import datetime

def now_str():
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")

def clone(variant: goodfire.Variant) -> goodfire.Variant:
    new_variant = goodfire.Variant(variant.base_model)
    for edit in variant.edits:
        new_variant.set(edit[0], edit[1]['value'], mode=edit[1]['mode'])

    return new_variant

In [None]:
# Some testing
#q = run_question(1)
#qs = run_questions()
#pprint(qs)
#print(to_vector(qs))

In [None]:
import pandas as pd
import tqdm
import itertools
from typing import Optional
import time

def tabular_experiments(features: list[goodfire.Feature], steerages: list[float], personas: Optional[list[str]] = None, wait: Optional[float]=None, base=base):
    if personas is None:
        personas = [None]
    results = []
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        jobs = []
        for feature in features:
            for steerage in steerages:
                model = goodfire.Variant(base)
                if feature is None:
                    assert steerage == 0
                else:
                    model.set(feature, steerage)
                for persona in personas:
                    jobs.append((feature, steerage, persona, executor.submit(run_questions, persona=persona, model=model)))
        for feature, steerage, persona, job in tqdm.tqdm(jobs):
            responses: list[Response] = job.result()
            if wait:
                time.sleep(wait)
            for response in responses:
                results.append(dict(
                    base=base,
                    feature=feature.label,
                    steerage=steerage,
                    persona=persona,
                    question=response.question,
                    mean_score=response.mean,
                    score=response.score,
                    text=response.text
                ))
    return pd.DataFrame(results)

# Experiments

In [None]:
if False:
    features = list(client.features.search("elephants", model=base, top_k=5)[0])
    steerages = [-0.8, -0.5, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.5, 0.8]
    personas = [0]
    experiments = tabular_experiments(features, steerages, personas)
    experiments.to_csv("data/" + now_str()+".csv", index=False)

In [None]:
# persona test'
if False: 
    features = list(client.features.search("moral", model=base, top_k=5)[0])
    steerages = [0]
    persona_tags = ['nationalities', 'ages', 'genders']
    for i, personas in enumerate([nationalities, ages, genders]):
        experiments = tabular_experiments(features[:1], steerages,personas )
        experiments.to_csv("data/" + now_str()+persona_tags[i]+".csv", index=False)

In [None]:
# keywords 
#'overall impact','duty', 'dignity', 'greater good', git 
for keyword in [ 'overall impact','duty']: # 'dignity', 'greater good','obligation','ethic']:
    print(f'Running search and steering for features associated with \"{keyword}\"\n')
    features = client.features.search(keyword, model=base)[0][:20]
    steerages = [-.5, -0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3, 0.5]
    experiments = tabular_experiments(features, steerages, persona=None, wait=1.5, base=base)
    experiments.to_csv("data/" + now_str()+''.join(keyword)+".csv", index=False)
    time.sleep(2)