In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports


In [40]:
from pathlib import Path

import torch
from torch import nn

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [26]:
DATA_DIR = Path("../../elk-reporters/gpt2/imdb/jovial-murdock/")
DATA_DIR

PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock')

## Load VINC model

In [27]:
!ls ../../elk-reporters/gpt2/imdb/jovial-murdock

cfg.yaml  fingerprints.yaml  lr_eval.csv  reporters	  train_lm_eval.csv
eval.csv  lm_eval.csv	     lr_models	  train_eval.csv


In [28]:
files = [f for f in (DATA_DIR / "reporters").iterdir() if f.suffix == ".pt"]
files = sorted(files, key=lambda x: int(x.stem.split("_")[-1]))
len(files), files[:5]

(13,
 [PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_0.pt'),
  PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_1.pt'),
  PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_2.pt'),
  PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_3.pt'),
  PosixPath('../../elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_4.pt')])

In [29]:
for f in files[:2]:
    print(f.name)
    tns = torch.load(f)
    print(tns["num_classes"])
    print(tns["in_features"])

    for k, v in tns.items():
        if isinstance(v, torch.Tensor):
            print(k, v.shape)
    
    print("-------------\n")

layer_0.pt
2
768
bias torch.Size([1])
scale torch.Size([1])
weight torch.Size([1, 768])
norm.mean_x torch.Size([768])
norm.mean_y torch.Size([1])
norm.u torch.Size([768, 1])
norm.x_M2 torch.Size([768])
norm.xcov_M2 torch.Size([768, 1])
norm.y_M2 torch.Size([1])
norm.n torch.Size([])
-------------

layer_1.pt
2
768
bias torch.Size([1])
scale torch.Size([1])
weight torch.Size([1, 768])
norm.mean_x torch.Size([768])
norm.mean_y torch.Size([1])
norm.u torch.Size([768, 1])
norm.x_M2 torch.Size([768])
norm.xcov_M2 torch.Size([768, 1])
norm.y_M2 torch.Size([1])
norm.n torch.Size([])
-------------



## Try and load a VINC probe

In [30]:
ELK_PATH = Path("../../elk/")
ELK_PATH.resolve()

PosixPath('/rds/user/am3052/hpc-work/elk-rlhf/elk')

In [31]:
import sys
sys.path

['/rds/user/am3052/hpc-work/elk-rlhf/src/prototyping',
 '/home/am3052/.conda/envs/elk/lib/python310.zip',
 '/home/am3052/.conda/envs/elk/lib/python3.10',
 '/home/am3052/.conda/envs/elk/lib/python3.10/lib-dynload',
 '',
 '/home/am3052/.conda/envs/elk/lib/python3.10/site-packages',
 '__editable__.eleuther_elk-0.1.1.finder.__path_hook__']

In [32]:
modules = [ELK_PATH, ELK_PATH / "elk" / "training"]

for module in modules:
    if not str(module) in sys.path:
        sys.path.insert(0, str(module.resolve()))

sys.path

['/rds/user/am3052/hpc-work/elk-rlhf/elk/elk/training',
 '/rds/user/am3052/hpc-work/elk-rlhf/elk',
 '/rds/user/am3052/hpc-work/elk-rlhf/src/prototyping',
 '/home/am3052/.conda/envs/elk/lib/python310.zip',
 '/home/am3052/.conda/envs/elk/lib/python3.10',
 '/home/am3052/.conda/envs/elk/lib/python3.10/lib-dynload',
 '',
 '/home/am3052/.conda/envs/elk/lib/python3.10/site-packages',
 '__editable__.eleuther_elk-0.1.1.finder.__path_hook__']

In [33]:
from reporter import Reporter

In [34]:
reporter_path = files[-1]
reporter_path.resolve()

PosixPath('/rds/user/am3052/hpc-work/elk-rlhf/elk-reporters/gpt2/imdb/jovial-murdock/reporters/layer_12.pt')

In [35]:
reporter = Reporter.load(reporter_path)
reporter

EigenReporter(
  (norm): SpectralNorm()
)

In [36]:
reporter.weight.shape

torch.Size([1, 768])

In [37]:
reporter.scale, reporter.bias

(Parameter containing:
 tensor([-14.0780], requires_grad=True),
 Parameter containing:
 tensor([0.0030], requires_grad=True))

## Try combining report with a language model

Inspiration taken from the [original repository](https://github.com/collin-burns/discovering_latent_knowledge/blob/main/CCS.ipynb).

In [56]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [67]:
dataset = load_dataset("amazon_polarity", split="test[:4]")
dataset

Found cached dataset amazon_polarity (/home/am3052/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc)


Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 4
})

In [69]:
dataset["label"]

[1, 1, 0, 1]

In [70]:
# TODO: remove label=0
def get_prompt(text, label=0):
    sentiment = ["negative", "positive"][label]
    return f"The following movie review expresses a {sentiment} sentiment:\n{text}"

input_texts = [get_prompt(text) + tokenizer.eos_token for text in dataset["content"]]
inputs = tokenizer(input_texts, return_tensors="pt", padding=True)
inputs.keys(), inputs["input_ids"].shape

(dict_keys(['input_ids', 'attention_mask']), torch.Size([4, 169]))

In [90]:
inputs["attention_mask"].sum(dim=1)

tensor([146, 169,  74,  56])

In [107]:
class MyRewardModel(nn.Module):
    def __init__(self, language_model, reporter, layer=-1):
        super().__init__()
        
        self.language_model = language_model
        self.reporter = reporter
        self.layer = layer

    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.language_model(
            input_ids, attention_mask=attention_mask,
            output_hidden_states=True,
        ).hidden_states[self.layer]
        
        # Find the index of the last non-padding token
        last_token_index = torch.sum(attention_mask, dim=1) - 1

        # Get the last token's output
        last_tokens = hidden_states[range(len(last_token_index)), last_token_index]
        print(last_tokens.shape)
        
        return self.reporter(last_tokens)

In [108]:
language_model = AutoModelForCausalLM.from_pretrained("gpt2")

my_reward_model = MyRewardModel(language_model, reporter)

In [109]:
with torch.no_grad():
    credences = my_reward_model(**inputs)

credences.shape

torch.Size([4, 768])


torch.Size([4])

In [110]:
credences

tensor([ 9.8509, 10.1446,  9.7738,  9.7631])