In [1]:
%load_ext autoreload
%autoreload 2

## Imports


In [2]:
import sys
from pathlib import Path

import torch
from torch import nn

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
ELK_PATH = Path("../../../elk/")
ELK_PATH.resolve()

PosixPath('/rds/user/am3052/hpc-work/elk')

In [4]:
modules = [ELK_PATH, ELK_PATH / "elk" / "training"]

for module in modules:
    if not str(module) in sys.path:
        sys.path.insert(0, str(module.resolve()))

sys.path[:2]

['/rds/user/am3052/hpc-work/elk/elk/training', '/rds/user/am3052/hpc-work/elk']

In [5]:
from reporter import Reporter

## Config

In [6]:
DATA_DIR = Path("../../../elk-reporters/gpt2/imdb/gracious-jennings/")
DATA_DIR

PosixPath('../../../elk-reporters/gpt2/imdb/gracious-jennings')

In [7]:
!ls {DATA_DIR}

cfg.yaml  fingerprints.yaml  lr_eval.csv  reporters	  train_lm_eval.csv
eval.csv  lm_eval.csv	     lr_models	  train_eval.csv


In [8]:
LAYER = 12

## Load VINC model

In [9]:
reporter_path = DATA_DIR / "reporters" / f"layer_{LAYER}.pt"
reporter_path.resolve()

PosixPath('/rds/user/am3052/hpc-work/elk-reporters/gpt2/imdb/gracious-jennings/reporters/layer_12.pt')

In [10]:
reporter = Reporter.load(reporter_path)
reporter

EigenReporter(
  (norm): ConceptEraser()
)

## Try combining report with a language model

Inspiration taken from the [original repository](https://github.com/collin-burns/discovering_latent_knowledge/blob/main/CCS.ipynb).

In [11]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [51]:
dataset = load_dataset("amazon_polarity", split="test[:4]")
dataset

Found cached dataset amazon_polarity (/home/am3052/.cache/huggingface/datasets/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc)


Dataset({
    features: ['label', 'title', 'content'],
    num_rows: 4
})

In [52]:
def get_prompt(text, label):
    sentiment = ["negative", "positive"][label]
    return f"""Below is a movie review in triple backticks: \
        \n```\n{text}\n``` \
        \n\nThe sentiment of the review is {sentiment}
    """

print(get_prompt(dataset["content"][0], dataset["label"][0]))

Below is a movie review in triple backticks:         
```
My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"
```         

The sentiment of the review is positive
    


In [53]:
# Get input texts
pos_input_texts = [get_prompt(text, label=1) + tokenizer.eos_token for text in dataset["content"]]
neg_input_texts = [get_prompt(text, label=0) + tokenizer.eos_token for text in dataset["content"]]

# Get the inputs (input_ids and attention_masks) for the positive and negative texts
pos_inputs = tokenizer(pos_input_texts, return_tensors="pt", padding=True)
neg_inputs = tokenizer(neg_input_texts, return_tensors="pt", padding=True)

pos_inputs["input_ids"].shape, neg_inputs["input_ids"].shape

(torch.Size([4, 209]), torch.Size([4, 209]))

In [54]:
pos_inputs["attention_mask"].sum(dim=1), neg_inputs["attention_mask"].sum(dim=1)

(tensor([186, 209, 114,  96]), tensor([186, 209, 114,  96]))

In [55]:
class MyRewardModel(nn.Module):
    def __init__(self, language_model, reporter, layer=-1):
        super().__init__()
        
        self.language_model = language_model # e.g. GPT-2
        self.reporter = reporter # EigenReporter, loaded using Reporter.load(path)
        self.layer = layer # which layer to extract

    
    def forward(self, pos_inputs, neg_inputs):
        # Get the hidden states
        pos_hidden_states = self.language_model(
            **pos_inputs, output_hidden_states=True,
        ).hidden_states[self.layer]
        neg_hidden_states = self.language_model(
            **neg_inputs, output_hidden_states=True,
        ).hidden_states[self.layer]
        
        # Find the index of the last non-padding token
        pos_last_token_index = torch.sum(pos_inputs["attention_mask"], dim=1) - 1
        neg_last_token_index = torch.sum(neg_inputs["attention_mask"], dim=1) - 1

        # Get the last token's output
        pos_last_tokens = pos_hidden_states[range(len(pos_last_token_index)), pos_last_token_index]
        neg_last_tokens = neg_hidden_states[range(len(neg_last_token_index)), neg_last_token_index]

        # Get the logits for the two classes
        pos_logits = self.reporter(pos_last_tokens)
        neg_logits = self.reporter(neg_last_tokens)

        # Return the difference in logits which will later be
        # passed through a sigmoid function
        return pos_logits - neg_logits


In [56]:
language_model = AutoModelForCausalLM.from_pretrained("gpt2")
reporter = Reporter.load(reporter_path)

my_reward_model = MyRewardModel(language_model, reporter)

In [57]:
with torch.no_grad():
    credences = my_reward_model(pos_inputs, neg_inputs)

outputs = torch.sigmoid(credences)
outputs

tensor([0.5005, 0.5015, 0.4993, 0.5002])

In [58]:
# Compare ground-truth with predictions
dataset["label"], (outputs > 0.5).int().tolist()

([1, 1, 0, 1], [1, 1, 0, 1])