Use concept-erasure implementation of LEACE and SAL #252
Merged

Changes from all commits (8 commits):

- `dc2cc49` Use concept-erasure implementation of LEACE and SAL (norabelrose)
- `0a70094` fix parameter name in ccs (lauritowal)
- `280343c` Fix test failures (norabelrose)
- `703844c` Merge branch 'leace' of github.com:EleutherAI/elk into leace (norabelrose)
- `fac6247` Be picky about the concept-erasure version (norabelrose)
- `0f6f120` Merge remote-tracking branch 'origin/main' into leace (norabelrose)
- `0f8d0a1` Refactor to support concept-erasure v0.1 (norabelrose)
- `3db2cc8` Fix test failure (norabelrose)
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
First changed file (a package `__init__` defining the public API):

```diff
@@ -1,16 +1,15 @@
-from .ccs_reporter import CcsReporter, CcsReporterConfig
+from .ccs_reporter import CcsConfig, CcsReporter
 from .classifier import Classifier
-from .concept_eraser import ConceptEraser
-from .eigen_reporter import EigenReporter, EigenReporterConfig
-from .reporter import Reporter, ReporterConfig
+from .common import FitterConfig
+from .eigen_reporter import EigenFitter, EigenFitterConfig
+from .platt_scaling import PlattMixin

 __all__ = [
-    "CcsReporter",
-    "CcsReporterConfig",
+    "CcsConfig",
+    "CcsReporter",
     "Classifier",
-    "ConceptEraser",
-    "EigenReporter",
-    "EigenReporterConfig",
-    "Reporter",
-    "ReporterConfig",
+    "EigenFitter",
+    "EigenFitterConfig",
+    "FitterConfig",
+    "PlattMixin",
 ]
```
The CCS reporter module, rewritten around concept-erasure's `LeaceFitter`:

```diff
@@ -3,71 +3,61 @@
 import math
 from copy import deepcopy
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import Literal, Optional, cast

 import torch
 import torch.nn as nn
+from concept_erasure import LeaceFitter
 from torch import Tensor

 from ..parsing import parse_loss
 from ..utils.typing import assert_type
-from .concept_eraser import ConceptEraser
+from .common import FitterConfig
 from .losses import LOSSES
-from .reporter import Reporter, ReporterConfig
+from .platt_scaling import PlattMixin


 @dataclass
-class CcsReporterConfig(ReporterConfig):
-    """
-    Args:
-        activation: The activation function to use. Defaults to GELU.
-        bias: Whether to use a bias term in the linear layers. Defaults to True.
-        hidden_size: The number of hidden units in the MLP. Defaults to None.
-            By default, use an MLP expansion ratio of 4/3. This ratio is used by
-            Tucker et al. (2022) <https://arxiv.org/abs/2204.09722> in their 3-layer
-            MLP probes. We could also use a ratio of 4, imitating transformer FFNs,
-            but this seems to lead to excessively large MLPs when num_layers > 2.
-        init: The initialization scheme to use. Defaults to "zero".
-        loss: The loss function to use. list of strings, each of the form
-            "coef*name", where coef is a float and name is one of the keys in
-            `elk.training.losses.LOSSES`.
-            Example: --loss 1.0*consistency_squared 0.5*prompt_var
-            corresponds to the loss function 1.0*consistency_squared + 0.5*prompt_var.
-            Defaults to the loss "ccs_squared_loss".
-        normalization: The kind of normalization to apply to the hidden states.
-        num_layers: The number of layers in the MLP. Defaults to 1.
-        pre_ln: Whether to include a LayerNorm module before the first linear
-            layer. Defaults to False.
-        supervised_weight: The weight of the supervised loss. Defaults to 0.0.
-
-        lr: The learning rate to use. Ignored when `optimizer` is `"lbfgs"`.
-            Defaults to 1e-2.
-        num_epochs: The number of epochs to train for. Defaults to 1000.
-        num_tries: The number of times to try training the reporter. Defaults to 10.
-        optimizer: The optimizer to use. Defaults to "adam".
-        weight_decay: The weight decay or L2 penalty to use. Defaults to 0.01.
-    """
-
+class CcsConfig(FitterConfig):
     activation: Literal["gelu", "relu", "swish"] = "gelu"
+    """The activation function to use."""
     bias: bool = True
+    """Whether to use a bias term in the linear layers."""
     hidden_size: Optional[int] = None
+    """
+    The number of hidden units in the MLP. Defaults to None. By default, use an MLP
+    expansion ratio of 4/3. This ratio is used by Tucker et al. (2022)
+    <https://arxiv.org/abs/2204.09722> in their 3-layer MLP probes. We could also use
+    a ratio of 4, imitating transformer FFNs, but this seems to lead to excessively
+    large MLPs when num_layers > 2.
+    """
     init: Literal["default", "pca", "spherical", "zero"] = "default"
+    """The initialization scheme to use."""
     loss: list[str] = field(default_factory=lambda: ["ccs"])
+    """
+    The loss function to use. list of strings, each of the form "coef*name", where coef
+    is a float and name is one of the keys in `elk.training.losses.LOSSES`.
+    Example: `--loss 1.0*consistency_squared 0.5*prompt_var` corresponds to the loss
+    function 1.0*consistency_squared + 0.5*prompt_var.
+    """
     loss_dict: dict[str, float] = field(default_factory=dict, init=False)
     num_layers: int = 1
+    """The number of layers in the MLP."""
     pre_ln: bool = False
+    """Whether to include a LayerNorm module before the first linear layer."""
     supervised_weight: float = 0.0
+    """The weight of the supervised loss."""

     lr: float = 1e-2
+    """The learning rate to use. Ignored when `optimizer` is `"lbfgs"`."""
     num_epochs: int = 1000
+    """The number of epochs to train for."""
     num_tries: int = 10
+    """The number of times to try training the reporter."""
     optimizer: Literal["adam", "lbfgs"] = "lbfgs"
+    """The optimizer to use."""
     weight_decay: float = 0.01
-
-    @classmethod
-    def reporter_class(cls) -> type[Reporter]:
-        return CcsReporter
+    """The weight decay or L2 penalty to use."""

     def __post_init__(self):
         self.loss_dict = parse_loss(self.loss)
@@ -76,19 +66,19 @@ def __post_init__(self):
         self.loss = [f"{coef}*{name}" for name, coef in self.loss_dict.items()]


-class CcsReporter(Reporter):
+class CcsReporter(nn.Module, PlattMixin):
     """CCS reporter network.

     Args:
         in_features: The number of input features.
         cfg: The reporter configuration.
     """

-    config: CcsReporterConfig
+    config: CcsConfig

     def __init__(
         self,
-        cfg: CcsReporterConfig,
+        cfg: CcsConfig,
         in_features: int,
         *,
         device: str | torch.device | None = None,
@@ -106,12 +96,7 @@ def __init__(

         hidden_size = cfg.hidden_size or 4 * in_features // 3

-        self.norm = ConceptEraser(
-            in_features,
-            2 * num_variants,
-            device=device,
-            dtype=dtype,
-        )
+        self.norm = None
         self.probe = nn.Sequential(
             nn.Linear(
                 in_features,
@@ -175,6 +160,8 @@ def reset_parameters(self):

     def forward(self, x: Tensor) -> Tensor:
         """Return the credence assigned to the hidden state `x`."""
+        assert self.norm is not None, "Must call fit() before forward()"
+
         raw_scores = self.probe(self.norm(x)).squeeze(-1)
         return raw_scores.mul(self.scale).add(self.bias).squeeze(-1)


@@ -203,19 +190,22 @@ def fit(self, hiddens: Tensor) -> float:
         x_neg, x_pos = hiddens.unbind(2)

         # One-hot indicators for each prompt template
-        n, v, _ = x_neg.shape
+        n, v, d = x_neg.shape
         prompt_ids = torch.eye(v, device=x_neg.device).expand(n, -1, -1)

-        self.norm.update(
+        fitter = LeaceFitter(d, 2 * v, dtype=x_neg.dtype, device=x_neg.device)
+        fitter.update(
             x=x_neg,
             # Independent indicator for each (template, pseudo-label) pair
-            y=torch.cat([torch.zeros_like(prompt_ids), prompt_ids], dim=-1),
+            z=torch.cat([torch.zeros_like(prompt_ids), prompt_ids], dim=-1),
         )
-        self.norm.update(
+        fitter.update(
             x=x_pos,
             # Independent indicator for each (template, pseudo-label) pair
-            y=torch.cat([prompt_ids, torch.zeros_like(prompt_ids)], dim=-1),
+            z=torch.cat([prompt_ids, torch.zeros_like(prompt_ids)], dim=-1),
         )
+        self.norm = fitter.eraser

         x_neg, x_pos = self.norm(x_neg), self.norm(x_pos)

         # Record the best acc, loss, and params found so far
@@ -299,9 +289,3 @@ def closure():

         optimizer.step(closure)
         return float(loss)
-
-    def save(self, path: Path | str) -> None:
-        """Save the reporter to a file."""
-        state = {k: v.cpu() for k, v in self.state_dict().items()}
-        state.update(in_features=self.in_features, num_variants=self.num_variants)
-        torch.save(state, path)
```

Inline review note on the `y=` → `z=` change: "fixed; replaced y with z for ccs"
A new module defining `FitterConfig` and the slimmed-down `Reporter`:

```diff
@@ -0,0 +1,31 @@
+"""An ELK reporter network."""
+
+from dataclasses import dataclass
+
+from concept_erasure import LeaceEraser
+from simple_parsing.helpers import Serializable
+from torch import Tensor, nn
+
+from .platt_scaling import PlattMixin
+
+
+@dataclass
+class FitterConfig(Serializable, decode_into_subclasses=True):
+    seed: int = 42
+    """The random seed to use."""
+
+
+@dataclass
+class Reporter(PlattMixin):
+    weight: Tensor
+    eraser: LeaceEraser
+
+    def __post_init__(self):
+        # Platt scaling parameters
+        self.bias = nn.Parameter(self.weight.new_zeros(1))
+        self.scale = nn.Parameter(self.weight.new_ones(1))
+
+    def __call__(self, hiddens: Tensor) -> Tensor:
+        """Return the predicted log odds on input `x`."""
+        raw_scores = self.eraser(hiddens) @ self.weight.mT
+        return raw_scores.mul(self.scale).add(self.bias).squeeze(-1)
```
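`PlattMixin` itself is not shown in this diff, but the `scale` and `bias` parameters it calibrates implement standard Platt scaling: fit a slope and offset on frozen logits by minimizing binary cross-entropy. A generic sketch of that idea (a hypothetical helper, not elk's actual mixin):

```python
import torch


def platt_scale(logits: torch.Tensor, labels: torch.Tensor, max_iter: int = 100):
    """Fit sigmoid(scale * logits + bias) to binary labels with L-BFGS."""
    scale = torch.ones(1, requires_grad=True)
    bias = torch.zeros(1, requires_grad=True)
    opt = torch.optim.LBFGS(
        [scale, bias], line_search_fn="strong_wolfe", max_iter=max_iter
    )

    def closure():
        opt.zero_grad()
        # BCE on the rescaled logits; only scale and bias are trained
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            scale * logits + bias, labels
        )
        loss.backward()
        return loss

    opt.step(closure)
    return scale.detach(), bias.detach()
```

Since the slope starts at 1 and the offset at 0, calibration can only lower the BCE relative to the raw logits; the probe's ranking of examples (and hence its AUROC) is unchanged when the fitted slope is positive.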
This file was deleted.
Review discussion:

Q: Isn't the eval() here still needed?

A: No, because CcsReporter doesn't actually have any submodules like nn.BatchNorm or nn.Dropout whose behavior changes due to eval().
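The point in that reply is easy to verify: `train()`/`eval()` only changes the output of modules that branch on `self.training`, such as `nn.Dropout` or the batch-norm family. A quick generic PyTorch check (not elk code; the probe here is a stand-in shaped like CcsReporter's Linear + GELU stack):

```python
import torch
from torch import nn

torch.manual_seed(0)
x = torch.randn(4, 8)

# A plain Linear + GELU stack: eval() is a no-op for its output
probe = nn.Sequential(nn.Linear(8, 1), nn.GELU())
out_train = probe(x)
probe.eval()
out_eval = probe(x)
assert torch.equal(out_train, out_eval)  # identical in both modes

# Dropout, by contrast, does depend on the mode
drop = nn.Dropout(p=0.5)
drop.train()
y_train = drop(x)  # randomly zeroed and rescaled
drop.eval()
y_eval = drop(x)
assert torch.equal(y_eval, x)  # identity in eval mode
```

So dropping the `eval()` call is safe for this probe, but would not be for a model containing dropout or batch norm.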