# Scoring explanations after generation

Here we show a simple example of how to score explanations that have already been generated.

In [1]:
from functools import partial
import os   
import torch
import orjson
import asyncio
from delphi.clients import OpenRouter
from delphi.config import ExperimentConfig, FeatureConfig
from delphi.explainers import explanation_loader
from delphi.features import (
    FeatureDataset,
    FeatureLoader
)
from delphi.features.constructors import default_constructor
from delphi.features.samplers import sample
from delphi.pipeline import Pipeline, process_wrapper
from delphi.scorers import FuzzingScorer



# OpenRouter API key, read from the environment (None when the variable is unset).
API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [3]:
# Feature configuration used when loading the cache.
feature_cfg = FeatureConfig(
    # Number of latents of your SAE.
    width=131_072,
    # Minimum number of examples for a feature to be considered for explanation.
    min_examples=200,
    # Maximum number of examples to sample from.
    max_examples=10_000,
    # Number of splits the cache was divided into.
    n_splits=5,
)


In [5]:
# The layer to score.
module = ".model.layers.10"
# The latents to score in that layer: indices 0 through 99.
feature_dict = {module: torch.arange(100)}

dataset = FeatureDataset(
    raw_dir="latents",  # Folder where the cache is stored
    cfg=feature_cfg,
    modules=[module],
    features=feature_dict,
)


Resolving data files:   0%|          | 0/150 [00:00<?, ?it/s]

We need to define the config for the examples shown to the scorer model.
The examples shown to the scorer model can be selected from:
- "quantiles", which gets examples from the quantiles of the data
- "activations", which gets examples from activation bins.

In [7]:

# Configuration of the examples shown to the scorer model.
experiment_cfg = ExperimentConfig(
    # Number of examples to sample for testing.
    n_examples_test=20,
    # Number of quantiles to sample.
    n_quantiles=10,
    # Length of each example.
    example_ctx_len=32,
    # Type of sampler to use for testing.
    test_type="quantiles",
)


The constructor and sampler here are the same as the ones used in the generation of the explanations.


In [8]:
# Sampler: `sample` with the experiment configuration bound.
sampler = partial(sample, cfg=experiment_cfg)

# Constructor: `default_constructor` with its settings taken from the two configs.
constructor = partial(
    default_constructor,
    max_examples=feature_cfg.max_examples,
    ctx_len=experiment_cfg.example_ctx_len,
    n_random=experiment_cfg.n_random,
)

loader = FeatureLoader(dataset, sampler=sampler, constructor=constructor)
    

Although we could generate the explanations in the pipeline, here we load explanations that were already generated. Then we define the scorer. Because the scorer needs information produced by the previous pipe, we wrap it with a preprocess function, which builds the record passed to the scorer, and a postprocess function, which saves the resulting score to a file.

In [None]:
# Pipe that loads the explanations already generated.
explainer_pipe = partial(explanation_loader, explanation_dir="results/explanations")

# Client passed to the scorer below.
client = OpenRouter("anthropic/claude-3.5-sonnet", api_key=API_KEY)


def scorer_preprocess(result):
    """Build the record given to the scorer from the previous pipe's result.

    Copies ``result.explanation`` onto ``result.record`` and exposes the
    record's ``not_active`` examples as ``extra_examples``. The record is
    modified in place and returned.
    """
    scorer_input = result.record
    scorer_input.explanation = result.explanation
    scorer_input.extra_examples = scorer_input.not_active
    return scorer_input

def scorer_postprocess(result, score_dir):
    """Save the score of one feature to a file.

    The score is serialized with ``orjson`` and written to
    ``results/scores/<feature>.txt``. The output directory is created first
    if it does not exist, so the first write does not fail with
    ``FileNotFoundError``.

    Args:
        result: Scorer result; ``result.record.feature`` names the output
            file and ``result.score`` is the payload that gets serialized.
        score_dir: Accepted so the function can be bound with ``partial``,
            but not used to build the output path.
            NOTE(review): confirm whether scores were meant to be written
            under a sub-directory named by this argument.
    """
    output_dir = "results/scores"
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{result.record.feature}.txt", "wb") as f:
        f.write(orjson.dumps(result.score))


# Wrap the fuzzing scorer so that its input is built by `scorer_preprocess`
# and each score is saved by `scorer_postprocess`.
scorer_pipe = process_wrapper(
    FuzzingScorer(client, tokenizer=dataset.tokenizer),
    postprocess=partial(scorer_postprocess, score_dir="fuzz"),
    preprocess=scorer_preprocess,
)

Now our pipeline only has three steps.

In [None]:
# Three steps: load the records, load their explanations, score them.
pipeline = Pipeline(
    loader,
    explainer_pipe,
    scorer_pipe,
)
number_of_parallel_latents = 10

# asyncio.run() raises RuntimeError when an event loop is already running in
# the current thread, which is the case inside a Jupyter kernel. Detect that
# situation first so the same cell works in a notebook and in a plain script.
try:
    asyncio.get_running_loop()
    loop_is_running = True
except RuntimeError:
    loop_is_running = False

# This starts scoring the loaded explanations.
if loop_is_running:
    # Run the pipeline on its own event loop in a worker thread and block
    # until it finishes. In a notebook, `await pipeline.run(...)` directly in
    # the cell is an alternative.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as worker:
        worker.submit(asyncio.run, pipeline.run(number_of_parallel_latents)).result()
else:
    asyncio.run(pipeline.run(number_of_parallel_latents))