# Scoring explanations after generation

This notebook shows a simple example of how to score explanations that have already been generated.

In [1]:
import os
from functools import partial

import orjson
import torch

from delphi.clients import OpenRouter
from delphi.config import ExperimentConfig, LatentConfig
from delphi.explainers import explanation_loader
from delphi.latents import LatentDataset
from delphi.latents.constructors import constructor
from delphi.latents.samplers import sample
from delphi.pipeline import Pipeline, process_wrapper
from delphi.scorers import FuzzingScorer

# Read the OpenRouter API key from the environment instead of hard-coding it.
# os.getenv returns None when OPENROUTER_API_KEY is not set.
API_KEY = os.getenv("OPENROUTER_API_KEY")

In [2]:
# Configuration handed to LatentDataset further down (as `cfg=latent_cfg`).
latent_cfg = LatentConfig(
    width=131072, # Number of latents in your SAE
    min_examples=200, 
    # Minimum number of examples required for a latent to be considered
    max_examples=10000, # Maximum number of examples to sample from
    n_splits=5 # Number of splits the cache was divided into
)


In [3]:
module = ".model.layers.10" # The layer to score
latent_dict = {module: torch.arange(0,3)} # Which latents to score: indices 0, 1 and 2




We need to define the config for the examples shown to the scorer model.
When selecting the examples to be shown to the scorer model we can select them from:
- "quantiles", which gets examples from the quantiles of the data
- "activations", which gets examples from activation bins.

In [4]:

# Settings for the examples that are shown to the scorer model.
experiment_cfg = ExperimentConfig(
    n_examples_test=10, # Number of examples to sample for testing
    n_quantiles=10, # Number of quantiles to divide the data into
    test_type="quantiles", # Sampler used for test examples: "quantiles" or "activations"
    n_non_activating=10, # Number of non-activating examples to sample
    example_ctx_len=32, # Length of each example
)


The constructor and sampler here are the same as the ones used in the generation of the explanations.


In [5]:
# Bind the example-construction settings from the configs above so that the
# dataset builds its examples with these parameters.
example_constructor = partial(
    constructor,
    n_not_active=experiment_cfg.n_non_activating,
    ctx_len=experiment_cfg.example_ctx_len,
    max_examples=latent_cfg.max_examples,
)

# Bind the experiment config to the sampler in the same way.
sampler = partial(sample, cfg=experiment_cfg)

dataset = LatentDataset(
    raw_dir="latents",  # The folder where the cache is stored
    cfg=latent_cfg,
    modules=[module],
    latents=latent_dict,
    # Pass the configured partial, not the bare `constructor` function;
    # otherwise the n_not_active / ctx_len / max_examples bound above are
    # never applied.
    constructor=example_constructor,
    sampler=sampler,
)
    

Although we could generate the explanations in the pipeline, here we load the explanations already generated. Then we define the scorer. Because the scorer should use information from the previous pipe, we have a preprocess and a postprocess function.

In [6]:
# OpenRouter client; it is passed to the FuzzingScorer defined below.
client = OpenRouter("anthropic/claude-3.5-sonnet",api_key=API_KEY)

# Load the explanations already generated (read from results/explanations)
# instead of generating new ones in this pipeline.
explainer_pipe = partial(explanation_loader, explanation_dir="results/explanations")


def scorer_preprocess(result):
    """Build the record handed to the scorer from the previous pipe's result.

    Copies ``result.explanation`` onto ``result.record`` and exposes the
    record's ``not_active`` examples as ``extra_examples``.

    Args:
        result: Object with ``record`` and ``explanation`` attributes.

    Returns:
        The same record object (modified in place), not a copy.
    """
    scored_record = result.record
    scored_record.explanation = result.explanation
    scored_record.extra_examples = scored_record.not_active
    return scored_record

def scorer_postprocess(result, score_dir):
    """Save one latent's score as JSON under ``results/scores/<score_dir>/``.

    ``score_dir`` used to be accepted but ignored, so every scorer wrote to
    the same ``results/scores/<latent>.txt`` path and different scorers would
    overwrite each other's output. Each scorer now gets its own subdirectory.

    Args:
        result: Scorer output with ``record.latent`` (used for the file name)
            and ``score`` (serialised with orjson).
        score_dir: Subdirectory name for this scorer, e.g. ``"fuzz"``.
    """
    out_dir = os.path.join("results", "scores", score_dir)
    # Create the directory on demand so the function does not depend on a
    # separate setup cell having been run first.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{result.record.latent}.txt")
    with open(out_path, "wb") as f:
        f.write(orjson.dumps(result.score))


# The scorer itself: fuzzing, using the client and the dataset's tokenizer.
fuzz_scorer = FuzzingScorer(client, tokenizer=dataset.tokenizer)

# Postprocess hook with the score directory name pre-bound.
save_fuzz_score = partial(scorer_postprocess, score_dir="fuzz")

# Wrap the scorer so that records are prepared before scoring and the
# resulting scores are written out afterwards.
scorer_pipe = process_wrapper(
    fuzz_scorer,
    preprocess=scorer_preprocess,
    postprocess=save_fuzz_score,
)

Now our pipeline only has three steps.

In [8]:
# Create the output directory for the scores. os.makedirs with exist_ok=True
# is the portable equivalent of `mkdir -p` and is safe to re-run.
os.makedirs(os.path.join("results", "scores"), exist_ok=True)


In [9]:
# Chain the stages: latent dataset -> explanation loader -> scorer.
pipeline = Pipeline(
    dataset,
    explainer_pipe,
    scorer_pipe,
)
number_of_parallel_latents = 10
await pipeline.run(number_of_parallel_latents) 
# This loads the stored explanations and starts scoring them;
# no new explanations are generated here.



No available randomly sampled non-activating sequences


Processing items: 2it [00:08,  4.25s/it]


[None, None]