EleutherAI · SrGonao · Feb 19, 2025 · Oct 3, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ Install this library as a local editable installation. Run the following command
 
 To run the default pipeline from the command line, use the following command:
 
-`python -m delphi meta-llama/Meta-Llama-3-8B EleutherAI/sae-llama-3-8b-32x --explainer_model 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4' --dataset_repo 'EleutherAI/fineweb-edu-dedup-10b' --dataset_split 'train[:1%]' --n_tokens 10_000_000 --max_latents 100 --hookpoints layers.5 --filter_bos --name llama-3-8B` 
+`python -m delphi meta-llama/Meta-Llama-3-8B EleutherAI/sae-llama-3-8b-32x --explainer_model 'hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4' --dataset_repo 'EleutherAI/fineweb-edu-dedup-10b' --dataset_split 'train[:1%]' --n_tokens 10_000_000 --max_latents 100 --hookpoints layers.5 --filter_bos --name llama-3-8B`
 
 This command will:
 1. Cache activations for the first 10 million tokens of EleutherAI/rpj-v2-sample.

diff --git a/delphi/__main__.py b/delphi/__main__.py
@@ -4,15 +4,12 @@
 from functools import partial
 from glob import glob
 from pathlib import Path
-from typing import Callable, cast
+from typing import Callable
 
 import orjson
 import torch
-from datasets import load_dataset
 from simple_parsing import ArgumentParser
-from sparsify.data import chunk_and_tokenize
 from torch import Tensor
-from torchtyping import TensorType
 from transformers import (
     AutoModel,
     AutoTokenizer,
@@ -26,12 +23,12 @@
 from delphi.config import CacheConfig, ExperimentConfig, LatentConfig, RunConfig
 from delphi.explainers import DefaultExplainer
 from delphi.latents import LatentCache, LatentDataset
-from delphi.latents.constructors import default_constructor
-from delphi.latents.samplers import sample
+from delphi.latents.neighbours import NeighbourCalculator
 from delphi.log.result_analysis import log_results
 from delphi.pipeline import Pipe, Pipeline, process_wrapper
 from delphi.scorers import DetectionScorer, FuzzingScorer
-from delphi.sparse_coders import load_hooks_sparse_coders
+from delphi.sparse_coders import load_hooks_sparse_coders, load_sparse_coders
+from delphi.utils import load_tokenized_data
 
 
 def load_artifacts(run_cfg: RunConfig):
@@ -59,6 +56,42 @@ def load_artifacts(run_cfg: RunConfig):
     return run_cfg.hookpoints, hookpoint_to_sparse_encode, model
 
 
+async def create_neighbours(
+    run_cfg: RunConfig,
+    latents_path: Path,
+    neighbours_path: Path,
+    hookpoints: list[str],
+    experiment_cfg: ExperimentConfig,
+):
+    """
+    Creates a neighbours file for the given hookpoints.
+    """
+    neighbours_path.mkdir(parents=True, exist_ok=True)
+
+    if experiment_cfg.neighbours_type != "co-occurrence":
+        saes = load_sparse_coders(run_cfg, device="cuda")
+
+    for hookpoint in hookpoints:
+
+        if experiment_cfg.neighbours_type == "co-occurrence":
+            neighbour_calculator = NeighbourCalculator(
+                cache_dir=latents_path / hookpoint, number_of_neighbours=100
+            )
+
+        elif experiment_cfg.neighbours_type == "decoder_similarity":
+
+            neighbour_calculator = NeighbourCalculator(
+                autoencoder=saes[hookpoint], number_of_neighbours=100
+            )
+
+        elif experiment_cfg.neighbours_type == "encoder_similarity":
+            neighbour_calculator = NeighbourCalculator(
+                autoencoder=saes[hookpoint], number_of_neighbours=100
+            )
+        neighbour_calculator.populate_neighbour_cache(experiment_cfg.neighbours_type)
+        neighbour_calculator.save_neighbour_cache(f"{neighbours_path}/{hookpoint}")
+
+
 async def process_cache(
     latent_cfg: LatentConfig,
     run_cfg: RunConfig,
@@ -88,25 +121,14 @@ async def process_cache(
         latent_dict = {
             hook: latent_range for hook in hookpoints
         }  # The latent range to explain
-        latent_dict = cast(dict[str, int | Tensor], latent_dict)
-
-    constructor = partial(
-        default_constructor,
-        token_loader=None,
-        n_not_active=experiment_cfg.n_non_activating,
-        ctx_len=experiment_cfg.example_ctx_len,
-        max_examples=latent_cfg.max_examples,
-    )
-    sampler = partial(sample, cfg=experiment_cfg)
 
     dataset = LatentDataset(
         raw_dir=str(latents_path),
-        cfg=latent_cfg,
+        latent_cfg=latent_cfg,
+        experiment_cfg=experiment_cfg,
         modules=hookpoints,
         latents=latent_dict,
         tokenizer=tokenizer,
-        constructor=constructor,
-        sampler=sampler,
     )
 
     if run_cfg.explainer_provider == "offline":
@@ -214,14 +236,15 @@ def populate_cache(
     """
     latents_path.mkdir(parents=True, exist_ok=True)
 
-    data = load_dataset(
-        cfg.dataset_repo, name=cfg.dataset_name, split=cfg.dataset_split
+    tokens = load_tokenized_data(
+        cfg.ctx_len,
+        tokenizer,
+        cfg.dataset_repo,
+        cfg.dataset_split,
+        cfg.dataset_name,
+        cfg.dataset_column,
+        run_cfg.seed,
     )
-    data = data.shuffle(run_cfg.seed)
-    data = chunk_and_tokenize(
-        data, tokenizer, max_seq_len=cfg.ctx_len, text_key=cfg.dataset_column
-    )
-    tokens = data["input_ids"]
 
     if run_cfg.filter_bos:
         if tokenizer.bos_token_id is None:
@@ -235,8 +258,6 @@ def populate_cache(
             ]
             tokens = truncated_tokens.reshape(-1, cfg.ctx_len)
 
-    tokens = cast(TensorType["batch", "seq"], tokens)
-
     cache = LatentCache(
         model,
         hookpoint_to_sparse_encode,
@@ -271,6 +292,7 @@ async def run(
     latents_path = base_path / "latents"
     explanations_path = base_path / "explanations"
     scores_path = base_path / "scores"
+    neighbours_path = base_path / "neighbours"
     visualize_path = base_path / "visualize"
 
     latent_range = torch.arange(run_cfg.max_latents) if run_cfg.max_latents else None
@@ -294,6 +316,19 @@ async def run(
         print(f"Files found in {latents_path}, skipping cache population...")
 
     del model, hookpoint_to_sparse_encode
+    if (
+        not glob(str(neighbours_path / ".*")) + glob(str(neighbours_path / "*"))
+        or "neighbours" in run_cfg.overwrite
+    ):
+        await create_neighbours(
+            run_cfg,
+            latents_path,
+            neighbours_path,
+            hookpoints,
+            experiment_cfg,
+        )
+    else:
+        print(f"Files found in {neighbours_path}, skipping...")
 
     if (
         not glob(str(scores_path / ".*")) + glob(str(scores_path / "*"))

diff --git a/delphi/config.py b/delphi/config.py
@@ -29,6 +29,17 @@ class ExperimentConfig(Serializable):
     test_type: Literal["quantiles", "activation"] = "quantiles"
     """Type of sampler to use for latent explanation testing."""
 
+    non_activating_source: Literal["random", "neighbours"] = "random"
+    """Source of non-activating examples. Random uses non-activating contexts
+    sampled from any non-activating window. Neighbours uses activating contexts
+    from pre-computed latent neighbours. They are still non-activating but
+    have a higher chance of being similar to the activating examples."""
+
+    neighbours_type: Literal[
+        "co-occurrence", "decoder_similarity", "encoder_similarity"
+    ] = "co-occurrence"
+    """Type of neighbours to use. Only used if non_activating_source is 'neighbours'."""
+
 
 @dataclass
 class LatentConfig(Serializable):
@@ -145,6 +156,6 @@ class RunConfig:
     scoring speed but can leak information to the fuzzing and detection scorer,
     as well as increasing the scorer LLM task difficulty."""
 
-    overwrite: list[Literal["cache", "scores"]] = list_field()
+    overwrite: list[Literal["cache", "neighbours", "scores"]] = list_field()
     """List of run stages to recompute. This is a debugging tool
     and may be removed in the future."""
diff --git a/delphi/explainers/default/default.py b/delphi/explainers/default/default.py
@@ -1,117 +1,35 @@
 import asyncio
-import re
+from dataclasses import dataclass
 
-from ...logger import logger
-from ..explainer import Explainer, ExplainerResult
+from ..explainer import ActivatingExample, Explainer
 from .prompt_builder import build_prompt
 
 
+@dataclass
 class DefaultExplainer(Explainer):
-    name = "default"
+    activations: bool = True
+    """Whether to show activations to the explainer."""
+    cot: bool = False
+    """Whether to use chain of thought reasoning."""
 
-    def __init__(
-        self,
-        client,
-        tokenizer,
-        verbose: bool = False,
-        activations: bool = False,
-        cot: bool = False,
-        threshold: float = 0.6,
-        temperature: float = 0.0,
-        **generation_kwargs,
-    ):
-        self.client = client
-        self.tokenizer = tokenizer
-        self.verbose = verbose
-
-        self.activations = activations
-        self.cot = cot
-        self.threshold = threshold
-        self.temperature = temperature
-        self.generation_kwargs = generation_kwargs
-
-    async def __call__(self, record):
-        messages = self._build_prompt(record.train)
-
-        response = await self.client.generate(
-            messages, temperature=self.temperature, **self.generation_kwargs
-        )
-
-        try:
-            explanation = self.parse_explanation(response.text)
-            if self.verbose:
-                logger.info(f"Explanation: {explanation}")
-                logger.info(f"Final message to explainer: {messages[-1]['content']}")
-                logger.info(f"Response from explainer: {response.text}")
-
-            return ExplainerResult(record=record, explanation=explanation)
-        except Exception as e:
-            logger.error(f"Explanation parsing failed: {e}")
-            return ExplainerResult(
-                record=record, explanation="Explanation could not be parsed."
-            )
-
-    def parse_explanation(self, text: str) -> str:
-        try:
-            match = re.search(r"\[EXPLANATION\]:\s*(.*)", text, re.DOTALL)
-            return (
-                match.group(1).strip() if match else "Explanation could not be parsed."
-            )
-        except Exception as e:
-            logger.error(f"Explanation parsing regex failed: {e}")
-            raise
-
-    def _highlight(self, index, example):
-        result = f"Example {index}: "
-
-        threshold = example.max_activation * self.threshold
-        if self.tokenizer is not None:
-            str_toks = self.tokenizer.batch_decode(example.tokens)
-            example.str_toks = str_toks
-        else:
-            str_toks = example.tokens
-            example.str_toks = str_toks
-        activations = example.activations
-
-        def check(i):
-            return activations[i] > threshold
-
-        i = 0
-        while i < len(str_toks):
-            if check(i):
-                result += "<<"
-
-                while i < len(str_toks) and check(i):
-                    result += str_toks[i]
-                    i += 1
-                result += ">>"
-            else:
-                result += str_toks[i]
-                i += 1
-
-        return "".join(result)
-
-    def _join_activations(self, example):
-        activations = []
-
-        for i, activation in enumerate(example.activations):
-            if activation > example.max_activation * self.threshold:
-                activations.append(
-                    (example.str_toks[i], int(example.normalized_activations[i]))
-                )
-
-        acts = ", ".join(f'("{item[0]}" : {item[1]})' for item in activations)
-
-        return "Activations: " + acts
-
-    def _build_prompt(self, examples):
+    def _build_prompt(self, examples: list[ActivatingExample]) -> list[dict]:
         highlighted_examples = []
 
         for i, example in enumerate(examples):
-            highlighted_examples.append(self._highlight(i + 1, example))
+            str_toks = self.tokenizer.batch_decode(example.tokens)
+            activations = example.activations.tolist()
+            highlighted_examples.append(self._highlight(str_toks, activations))
 
             if self.activations:
-                highlighted_examples.append(self._join_activations(example))
+                assert (
+                    example.normalized_activations is not None
+                ), "Normalized activations are required for activations in explainer"
+                normalized_activations = example.normalized_activations.tolist()
+                highlighted_examples.append(
+                    self._join_activations(
+                        str_toks, activations, normalized_activations
+                    )
+                )
 
         highlighted_examples = "\n".join(highlighted_examples)
 

diff --git a/delphi/explainers/default/prompt_builder.py b/delphi/explainers/default/prompt_builder.py
@@ -26,10 +26,10 @@ def build_examples(
 
 
 def build_prompt(
-    examples,
+    examples: str,
     activations: bool = False,
     cot: bool = False,
-):
+) -> list[dict]:
     messages = system(
         cot=cot,
     )
@@ -49,7 +49,7 @@ def build_prompt(
             "content": user_start,
         }
     )
-    
+
     return messages