From 965a3ccb32a5bbb6cb1d8977c8307edd18c3cdb9 Mon Sep 17 00:00:00 2001
From: anthonyduong <anthonyduong9@gmail.com>
Date: Wed, 4 Jun 2025 16:51:07 -0700
Subject: [PATCH 1/7] Fix EmbeddingScorer._prepare() passing an argument of the wrong type

---
 delphi/scorers/embedding/embedding.py | 33 ++++++++++++++-------------
 delphi/scorers/surprisal/surprisal.py | 15 ++++--------
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/delphi/scorers/embedding/embedding.py b/delphi/scorers/embedding/embedding.py
index 2de89874..26943623 100644
--- a/delphi/scorers/embedding/embedding.py
+++ b/delphi/scorers/embedding/embedding.py
@@ -51,7 +51,7 @@ async def __call__(  # type: ignore
         random.shuffle(samples)
         results = self._query(
             record.explanation,
-            samples,  # type: ignore
+            samples,
         )
 
         return ScorerResult(record=record, score=results)
@@ -59,30 +59,31 @@ async def __call__(  # type: ignore
     def call_sync(self, record: LatentRecord) -> list[EmbeddingOutput]:
         return asyncio.run(self.__call__(record))  # type: ignore
 
-    def _prepare(self, record: LatentRecord) -> list[list[Sample]]:
+    def _prepare(self, record: LatentRecord) -> list[Sample]:
         """
         Prepare and shuffle a list of samples for classification.
         """
+        samples = []
 
-        defaults = {
-            "tokenizer": self.tokenizer,
-        }
-        samples = examples_to_samples(
-            record.extra_examples,  # type: ignore
-            distance=-1,
-            **defaults,  # type: ignore
-        )
+        if record.extra_examples is not None:
+            samples.extend(
+                examples_to_samples(
+                    record.extra_examples,
+                    tokenizer=self.tokenizer,
+                    distance=-1,
+                )
+            )
 
-        for i, examples in enumerate(record.test):
+        for i, example in enumerate(record.test):
             samples.extend(
                 examples_to_samples(
-                    examples,  # type: ignore
+                    [example],
+                    tokenizer=self.tokenizer,
                     distance=i + 1,
-                    **defaults,  # type: ignore
                 )
             )
 
-        return samples  # type: ignore
+        return samples
 
     def _query(self, explanation: str, samples: list[Sample]) -> list[EmbeddingOutput]:
         explanation_string = (
@@ -110,7 +111,7 @@ def _query(self, explanation: str, samples: list[Sample]) -> list[EmbeddingOutpu
 
 def examples_to_samples(
     examples: list[Example],
-    tokenizer: PreTrainedTokenizer,
+    tokenizer: PreTrainedTokenizer | None,
     **sample_kwargs,
 ) -> list[Sample]:
     samples = []
@@ -118,7 +119,7 @@ def examples_to_samples(
         if tokenizer is not None:
             text = "".join(tokenizer.batch_decode(example.tokens))
         else:
-            text = "".join(example.tokens)
+            text = "".join(str(token) for token in example.tokens)
         activations = example.activations.tolist()
         samples.append(
             Sample(
diff --git a/delphi/scorers/surprisal/surprisal.py b/delphi/scorers/surprisal/surprisal.py
index ee92b1c1..931a4f5c 100644
--- a/delphi/scorers/surprisal/surprisal.py
+++ b/delphi/scorers/surprisal/surprisal.py
@@ -7,8 +7,6 @@
 from torch.nn.functional import cross_entropy
 from transformers import PreTrainedTokenizer
 
-from delphi.utils import assert_type
-
 from ...latents import ActivatingExample, Example, LatentRecord
 from ..scorer import Scorer, ScorerResult
 from .prompts import BASEPROMPT as base_prompt
@@ -74,24 +72,19 @@ def _prepare(self, record: LatentRecord) -> list[Sample]:
         Prepare and shuffle a list of samples for classification.
         """
 
-        defaults = {
-            "tokenizer": self.tokenizer,
-        }
-
         assert record.extra_examples is not None, "No extra examples provided"
         samples = examples_to_samples(
             record.extra_examples,
+            tokenizer=self.tokenizer,
             distance=-1,
-            **defaults,
         )
 
-        for i, examples in enumerate(record.test):
-            examples = assert_type(list, examples)
+        for i, example in enumerate(record.test):
             samples.extend(
                 examples_to_samples(
-                    examples,
+                    [example],
+                    tokenizer=self.tokenizer,
                     distance=i + 1,
-                    **defaults,
                 )
             )
 

From 3809ed398fe49c9a71351b10c102ecde74753707 Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 08:49:58 -0400
Subject: [PATCH 2/7] Make str_tokens not optional

---
 delphi/latents/latents.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/delphi/latents/latents.py b/delphi/latents/latents.py
index 5e5611c2..d1190932 100644
--- a/delphi/latents/latents.py
+++ b/delphi/latents/latents.py
@@ -75,12 +75,6 @@ class Example:
     activations: Float[Tensor, "ctx_len"]
     """Activation values for the input sequence."""
 
-    str_tokens: list[str] | None = None
-    """Tokenized input sequence as strings."""
-
-    normalized_activations: Optional[Float[Tensor, "ctx_len"]] = None
-    """Activations quantized to integers in [0, 10]."""
-
     @property
     def max_activation(self) -> float:
         """
@@ -98,6 +92,12 @@ class ActivatingExample(Example):
     An example of a latent that activates a model.
     """
 
+    str_tokens: list[str]
+    """Tokenized input sequence as strings."""
+
+    normalized_activations: Float[Tensor, "ctx_len"]
+    """Activations quantized to integers in [0, 10]."""
+
     quantile: int = 0
     """The quantile of the activating example."""
 
@@ -108,6 +108,9 @@ class NonActivatingExample(Example):
     An example of a latent that does not activate a model.
     """
 
+    str_tokens: list[str]
+    """Tokenized input sequence as strings."""
+
     distance: float = 0.0
     """
     The distance from the neighbouring latent.

From 7f984043393555974584b35c6f31a44088b9da3c Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 08:50:57 -0400
Subject: [PATCH 3/7] Correct logic for examples to samples

---
 delphi/scorers/embedding/embedding.py | 70 +++++++++++++--------------
 1 file changed, 33 insertions(+), 37 deletions(-)

diff --git a/delphi/scorers/embedding/embedding.py b/delphi/scorers/embedding/embedding.py
index 26943623..0e6f44b7 100644
--- a/delphi/scorers/embedding/embedding.py
+++ b/delphi/scorers/embedding/embedding.py
@@ -1,9 +1,9 @@
 import asyncio
 import random
 from dataclasses import dataclass
-from typing import NamedTuple
+from typing import NamedTuple, Sequence
 
-from transformers import PreTrainedTokenizer
+from delphi.latents.latents import ActivatingExample, NonActivatingExample
 
 from ...latents import Example, LatentRecord
 from ..scorer import Scorer, ScorerResult
@@ -33,19 +33,17 @@ class EmbeddingScorer(Scorer):
     def __init__(
         self,
         model,
-        tokenizer: PreTrainedTokenizer | None = None,
         verbose: bool = False,
         **generation_kwargs,
     ):
         self.model = model
         self.verbose = verbose
-        self.tokenizer = tokenizer
         self.generation_kwargs = generation_kwargs
 
-    async def __call__(  # type: ignore
-        self,  # type: ignore
-        record: LatentRecord,  # type: ignore
-    ) -> ScorerResult:  # type: ignore
+    async def __call__(
+        self,
+        record: LatentRecord,
+    ) -> ScorerResult:
         samples = self._prepare(record)
 
         random.shuffle(samples)
@@ -56,8 +54,8 @@ async def __call__(  # type: ignore
 
         return ScorerResult(record=record, score=results)
 
-    def call_sync(self, record: LatentRecord) -> list[EmbeddingOutput]:
-        return asyncio.run(self.__call__(record))  # type: ignore
+    def call_sync(self, record: LatentRecord) -> ScorerResult:
+        return asyncio.run(self.__call__(record))
 
     def _prepare(self, record: LatentRecord) -> list[Sample]:
         """
@@ -65,23 +63,21 @@ def _prepare(self, record: LatentRecord) -> list[Sample]:
         """
         samples = []
 
-        if record.extra_examples is not None:
-            samples.extend(
-                examples_to_samples(
-                    record.extra_examples,
-                    tokenizer=self.tokenizer,
-                    distance=-1,
-                )
+        assert (
+            record.extra_examples is not None
+        ), "Extra (non-activating) examples need to be provided"
+
+        samples.extend(
+            examples_to_samples(
+                record.extra_examples,
             )
+        )
 
-        for i, example in enumerate(record.test):
-            samples.extend(
-                examples_to_samples(
-                    [example],
-                    tokenizer=self.tokenizer,
-                    distance=i + 1,
-                )
+        samples.extend(
+            examples_to_samples(
+                record.test,
             )
+        )
 
         return samples
 
@@ -94,38 +90,38 @@ def _query(self, explanation: str, samples: list[Sample]) -> list[EmbeddingOutpu
         query_embeding = self.model.encode(explanation_prompt)
         samples_text = [sample.text for sample in samples]
 
-        # # Temporary batching
-        # sample_embedings = []
-        # for i in range(0, len(samples_text), 10):
-        #     sample_embedings.extend(self.model.encode(samples_text[i:i+10]))
         sample_embedings = self.model.encode(samples_text)
         similarity = self.model.similarity(query_embeding, sample_embedings)[0]
 
         results = []
         for i in range(len(samples)):
-            # print(i)
             samples[i].data.similarity = similarity[i].item()
             results.append(samples[i].data)
         return results
 
 
 def examples_to_samples(
-    examples: list[Example],
-    tokenizer: PreTrainedTokenizer | None,
-    **sample_kwargs,
+    examples: Sequence[Example],
 ) -> list[Sample]:
     samples = []
     for example in examples:
-        if tokenizer is not None:
-            text = "".join(tokenizer.batch_decode(example.tokens))
-        else:
-            text = "".join(str(token) for token in example.tokens)
+        assert isinstance(example, ActivatingExample) or isinstance(
+            example, NonActivatingExample
+        )
+        text = "".join(str(token) for token in example.str_tokens)
         activations = example.activations.tolist()
         samples.append(
             Sample(
                 text=text,
                 activations=activations,
-                data=EmbeddingOutput(text=text, **sample_kwargs),
+                data=EmbeddingOutput(
+                    text=text,
+                    distance=(
+                        example.quantile
+                        if isinstance(example, ActivatingExample)
+                        else example.distance
+                    ),
+                ),
             )
         )
 

From 94f3dd73d70ce21d3f069bac5322d31ace144988 Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 08:53:14 -0400
Subject: [PATCH 4/7] Clean up surprisal as well

---
 delphi/scorers/surprisal/surprisal.py | 52 ++++++++++++++-------------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/delphi/scorers/surprisal/surprisal.py b/delphi/scorers/surprisal/surprisal.py
index 931a4f5c..1e10829c 100644
--- a/delphi/scorers/surprisal/surprisal.py
+++ b/delphi/scorers/surprisal/surprisal.py
@@ -1,13 +1,17 @@
 import random
 from dataclasses import dataclass
-from typing import NamedTuple
+from typing import NamedTuple, Sequence
 
 import torch
 from simple_parsing import field
 from torch.nn.functional import cross_entropy
-from transformers import PreTrainedTokenizer
 
-from ...latents import ActivatingExample, Example, LatentRecord
+from ...latents import (
+    ActivatingExample,
+    Example,
+    LatentRecord,
+    NonActivatingExample,
+)
 from ..scorer import Scorer, ScorerResult
 from .prompts import BASEPROMPT as base_prompt
 
@@ -42,21 +46,19 @@ class SurprisalScorer(Scorer):
     def __init__(
         self,
         model,
-        tokenizer,
         verbose: bool,
         batch_size: int,
         **generation_kwargs,
     ):
         self.model = model
         self.verbose = verbose
-        self.tokenizer = tokenizer
         self.batch_size = batch_size
         self.generation_kwargs = generation_kwargs
 
-    async def __call__(  # type: ignore
-        self,  # type: ignore
-        record: LatentRecord,  # type: ignore
-    ) -> ScorerResult:  # type: ignore
+    async def __call__(
+        self,
+        record: LatentRecord,
+    ) -> ScorerResult:
         samples = self._prepare(record)
 
         random.shuffle(samples)
@@ -75,27 +77,22 @@ def _prepare(self, record: LatentRecord) -> list[Sample]:
         assert record.extra_examples is not None, "No extra examples provided"
         samples = examples_to_samples(
             record.extra_examples,
-            tokenizer=self.tokenizer,
-            distance=-1,
         )
 
-        for i, example in enumerate(record.test):
-            samples.extend(
-                examples_to_samples(
-                    [example],
-                    tokenizer=self.tokenizer,
-                    distance=i + 1,
-                )
+        samples.extend(
+            examples_to_samples(
+                record.test,
             )
+        )
 
         return samples
 
     def compute_loss_with_kv_cache(
         self, explanation: str, samples: list[Sample], batch_size=2
     ):
-        # print(explanation_prompt)
         model = self.model
         tokenizer = self.model.tokenizer
+        assert tokenizer is not None, "Tokenizer is not set in model.tokenizer"
         # Tokenize explanation
         tokenizer.padding_side = "right"
         tokenizer.pad_token = tokenizer.eos_token
@@ -180,20 +177,27 @@ def _query(self, explanation: str, samples: list[Sample]) -> list[SurprisalOutpu
 
 
 def examples_to_samples(
-    examples: list[Example] | list[ActivatingExample],
-    tokenizer: PreTrainedTokenizer,
-    **sample_kwargs,
+    examples: Sequence[Example],
 ) -> list[Sample]:
     samples = []
     for example in examples:
-        text = "".join(tokenizer.batch_decode(example.tokens))
+        assert isinstance(example, ActivatingExample) or isinstance(
+            example, NonActivatingExample
+        )
+        text = "".join(str(token) for token in example.str_tokens)
         activations = example.activations.tolist()
         samples.append(
             Sample(
                 text=text,
                 activations=activations,
                 data=SurprisalOutput(
-                    activations=activations, text=text, **sample_kwargs
+                    activations=activations,
+                    text=text,
+                    distance=(
+                        example.quantile
+                        if isinstance(example, ActivatingExample)
+                        else example.distance
+                    ),
                 ),
             )
         )

From 9dc9bb01e8ad2f3e12027f4f4648f97d6f24bcfd Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 09:15:27 -0400
Subject: [PATCH 5/7] Make ActivatingExample str_tokens and normalized_activations optional

---
 delphi/latents/latents.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/delphi/latents/latents.py b/delphi/latents/latents.py
index d1190932..91a4b176 100644
--- a/delphi/latents/latents.py
+++ b/delphi/latents/latents.py
@@ -92,12 +92,12 @@ class ActivatingExample(Example):
     An example of a latent that activates a model.
     """
 
-    str_tokens: list[str]
-    """Tokenized input sequence as strings."""
-
-    normalized_activations: Float[Tensor, "ctx_len"]
+    normalized_activations: Optional[Float[Tensor, "ctx_len"]] = None
     """Activations quantized to integers in [0, 10]."""
 
+    str_tokens: Optional[list[str]] = None
+    """Tokenized input sequence as strings."""
+
     quantile: int = 0
     """The quantile of the activating example."""
 
@@ -128,7 +128,7 @@ class LatentRecord:
     """The latent associated with the record."""
 
     examples: list[ActivatingExample] = field(default_factory=list)
-    """Example sequences where the latent activations, assumed to be sorted in
+    """Example sequences where the latent activates, assumed to be sorted in
     descending order by max activation."""
 
     not_active: list[NonActivatingExample] = field(default_factory=list)

From ab7f757fe96f9d1eeb6e317b94334236498361d3 Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 09:15:45 -0400
Subject: [PATCH 6/7] Assert str_tokens is not None to satisfy type hints

---
 delphi/scorers/embedding/embedding.py | 1 +
 delphi/scorers/surprisal/surprisal.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/delphi/scorers/embedding/embedding.py b/delphi/scorers/embedding/embedding.py
index 0e6f44b7..ed911866 100644
--- a/delphi/scorers/embedding/embedding.py
+++ b/delphi/scorers/embedding/embedding.py
@@ -108,6 +108,7 @@ def examples_to_samples(
         assert isinstance(example, ActivatingExample) or isinstance(
             example, NonActivatingExample
         )
+        assert example.str_tokens is not None
         text = "".join(str(token) for token in example.str_tokens)
         activations = example.activations.tolist()
         samples.append(
diff --git a/delphi/scorers/surprisal/surprisal.py b/delphi/scorers/surprisal/surprisal.py
index 1e10829c..7f42be04 100644
--- a/delphi/scorers/surprisal/surprisal.py
+++ b/delphi/scorers/surprisal/surprisal.py
@@ -184,6 +184,7 @@ def examples_to_samples(
         assert isinstance(example, ActivatingExample) or isinstance(
             example, NonActivatingExample
         )
+        assert example.str_tokens is not None
         text = "".join(str(token) for token in example.str_tokens)
         activations = example.activations.tolist()
         samples.append(

From 6e016181e77516a488ec13ec5a8062fbc3f70a65 Mon Sep 17 00:00:00 2001
From: SrGonao <goncalo@eleuther.ai>
Date: Thu, 12 Jun 2025 09:16:13 -0400
Subject: [PATCH 7/7] Remove explicit normalized_activations=None from example constructors

---
 delphi/latents/constructors.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/delphi/latents/constructors.py b/delphi/latents/constructors.py
index 95aee37c..fa857bef 100644
--- a/delphi/latents/constructors.py
+++ b/delphi/latents/constructors.py
@@ -47,7 +47,6 @@ def prepare_non_activating_examples(
         NonActivatingExample(
             tokens=toks,
             activations=acts,
-            normalized_activations=None,
             distance=distance,
             str_tokens=tokenizer.batch_decode(toks),
         )
@@ -281,7 +280,6 @@ def constructor(
         ActivatingExample(
             tokens=toks,
             activations=acts,
-            normalized_activations=None,
         )
         for toks, acts in zip(token_windows, act_windows)
     ]