## Imports


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

import torch
from torch import nn

from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [3]:
ELK_PATH = Path("../../../elk/")
print(ELK_PATH.resolve())

# Directories that have to be importable for the project-local imports below
modules = [
    ELK_PATH,
    ELK_PATH / "elk" / "training",
    ELK_PATH / "elk" / "promptsource",
]


def add_to_sys_path(paths):
    """Prepend each path, resolved to its absolute form, to ``sys.path`` once.

    The membership test uses the same resolved string that gets inserted, so
    re-running the cell does not pile up duplicate entries.

    Args:
        paths: iterable of ``str`` or ``Path`` objects.
    """
    for path in paths:
        resolved = str(Path(path).resolve())
        if resolved not in sys.path:
            sys.path.insert(0, resolved)


add_to_sys_path(modules)

print(sys.path[:3])

from reporter import Reporter
from templates import DatasetTemplates

/fsx/home-augustas/elk
['/fsx/home-augustas/elk/elk/promptsource', '/fsx/home-augustas/elk/elk/training', '/fsx/home-augustas/elk']


## Config

In [4]:
# Use the GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Run directory; the listing below shows it holds the reporters and the
# evaluation CSV files. Built as a Path in one step.
DATA_DIR = Path(
    "../../../VINC-logs/"
    "allenai/unifiedqa-v2-t5-11b-1363200/"
    "AugustasM/burns-datasets-VINC/strange-montalcini"
)

!ls {DATA_DIR}

cfg.yaml  fingerprints.yaml  lr_eval.csv  reporters	  train_lm_eval.csv
eval.csv  lm_eval.csv	     lr_models	  train_eval.csv


In [6]:
# Checkpoint to load. The 11b variant ("allenai/unifiedqa-v2-t5-11b-1363200")
# used to be assigned here and was overwritten on the very next line, so only
# the 3b name ever took effect; the dead assignment has been dropped.
# NOTE(review): DATA_DIR points into the 11b run directory while the 3b
# checkpoint is what gets loaded — confirm this pairing is intended.
model_name = "allenai/unifiedqa-v2-t5-3b-1363200"

# Layer index: selects both the reporter file (layer_{LAYER}.pt) and the
# hidden-state layer handed to the reward model
LAYER = 24

## Dataset

In [7]:
# Load only the first 64 rows of the validation split
dataset = load_dataset(
    path="AugustasM/burns-datasets-VINC",
    split="validation[:64]",
)
dataset

Found cached dataset parquet (/admin/home-augustas/.cache/huggingface/datasets/AugustasM___parquet/AugustasM--burns-datasets-VINC-85ec467026b56702/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


Dataset({
    features: ['text', 'label', 'original_dataset', 'template_name'],
    num_rows: 64
})

In [8]:
dataset_template_path = "AugustasM/burns-datasets-VINC/all"
dataset_templates = DatasetTemplates(dataset_template_path)

# Keep only the templates whose get_answer_choices_list returns something
# other than None for the first dataset row, keyed by template name
usable_templates = {}
for candidate in dataset_templates.templates.values():
    if candidate.get_answer_choices_list(dataset[0]) is None:
        continue
    usable_templates[candidate.name] = candidate
dataset_templates.templates = usable_templates
print(dataset_templates.templates)

# Work with the first remaining template
template = list(dataset_templates.templates.values())[0]
template

{'Consider the text below': <templates.Template object at 0x7f213f2c5de0>, 'Is the given text truthful': <templates.Template object at 0x7f213f2c5e10>, 'Text first': <templates.Template object at 0x7f213f2c5e40>, 'No question no choices': <templates.Template object at 0x7f213f2c4c70>, 'No question with choices': <templates.Template object at 0x7f213f2c5d80>}


<templates.Template at 0x7f213f2c5de0>

## Tokenizer

In [9]:
# Show which checkpoint name is currently active before the tokenizer is loaded
model_name

'allenai/unifiedqa-v2-t5-3b-1363200'

In [10]:
# NOTE(review): truncation_side presumably only matters when truncation is
# enabled; the tokenizer call used later in this notebook does not pass a
# truncation argument — confirm whether truncation is wanted.
tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    truncation_side="left",
)
tokenizer

T5Tokenizer(name_or_path='allenai/unifiedqa-v2-t5-3b-1363200', vocab_size=32100, model_max_length=512, is_fast=False, padding_side='right', truncation_side='left', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<ex

## Try combining a reporter with a language model

Inspiration taken from the [original repository](https://github.com/collin-burns/discovering_latent_knowledge/blob/main/CCS.ipynb).

In [21]:
# Load the checkpoint, then move it to the selected device
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
# Switch to evaluation mode; the trailing semicolon hides the cell's repr output
model.eval();

In [11]:
# NOTE: superseded draft, kept commented out. The active MyRewardModel is
# defined in the next cell. This version differs in that it loads the language
# model itself from a name and picks the "last token" via the attention mask.
# NOTE(review): the attention mask presumably describes the input (question)
# tokens, whereas the default hidden_state_name is "decoder_hidden_states" —
# looks like a mismatch; confirm before reviving this code.
# class MyRewardModel(nn.Module):
#     def __init__(
#             self, language_model_name, reporter_path,
#             layer=-1, device="cpu", hidden_state_name="decoder_hidden_states",
#         ):
#         super().__init__()

#         # Load the language model and the reporter
#         self.language_model = T5ForConditionalGeneration.from_pretrained(
#             language_model_name
#         ).to(device)
#         self.language_model.eval()

#         self.reporter = Reporter.load(reporter_path).to(device)
#         self.reporter.eval()

#         self.layer = layer # which layer to extract
#         self.hidden_state_name = hidden_state_name

    
#     def forward(self, pos_inputs, neg_inputs):
#         # Get the hidden states
#         pos_hidden_states = self.language_model(
#             **pos_inputs, output_hidden_states=True,
#         )[self.hidden_state_name][self.layer]
#         neg_hidden_states = self.language_model(
#             **neg_inputs, output_hidden_states=True,
#         )[self.hidden_state_name][self.layer]
        
#         # Find the index of the last non-padding token
#         pos_last_token_index = torch.sum(pos_inputs["attention_mask"], dim=1) - 1
#         neg_last_token_index = torch.sum(neg_inputs["attention_mask"], dim=1) - 1

#         # Get the last token's output
#         pos_last_tokens = pos_hidden_states[range(len(pos_last_token_index)), pos_last_token_index]
#         neg_last_tokens = neg_hidden_states[range(len(neg_last_token_index)), neg_last_token_index]

#         # Get the logits for the two classes
#         pos_logits = self.reporter(pos_last_tokens)
#         neg_logits = self.reporter(neg_last_tokens)

#         # Return the difference in logits which will later be
#         # passed through a sigmoid function
#         return pos_logits - neg_logits


In [42]:
class MyRewardModel(nn.Module):
    """Scores a (positive, negative) input pair with a language model and a reporter.

    For each of the two inputs, the hidden state of the last non-padding
    position at ``layer`` is passed to the reporter; the difference of the two
    reporter outputs is returned.
    """

    def __init__(
        self, language_model, reporter_path,
        layer=-1, device="cpu", hidden_state_name="decoder_hidden_states",
    ):
        """
        Args:
            language_model: an already-instantiated model. It is stored as-is:
                it is neither moved to ``device`` nor switched to eval mode here.
            reporter_path: path handed to ``Reporter.load``.
            layer: index into the sequence of hidden states.
            device: device the reporter is moved to.
            hidden_state_name: key of the model output that holds the hidden states.
        """
        super().__init__()

        # The caller owns the language model, so one loaded model can be reused
        self.language_model = language_model

        self.reporter = Reporter.load(reporter_path).to(device)
        self.reporter.eval()

        self.layer = layer  # which layer to extract
        self.hidden_state_name = hidden_state_name

    def _padding_mask(self, inputs):
        """Return a B x T mask of non-padding positions, or None when unknown."""
        if str(self.hidden_state_name).startswith("encoder"):
            return inputs.get("attention_mask")

        if inputs.get("decoder_attention_mask") is not None:
            return inputs["decoder_attention_mask"]

        labels = inputs.get("labels")
        if labels is None:
            return None

        # -100 is the conventional "ignore" value for padded labels; the
        # model's own pad token id is treated as padding too, when available.
        mask = labels != -100
        config = getattr(self.language_model, "config", None)
        pad_token_id = getattr(config, "pad_token_id", None)
        if pad_token_id is not None:
            mask = mask & (labels != pad_token_id)
        return mask

    def _last_token_states(self, inputs):
        """Run the language model and return the B x H last-token hidden states."""
        hidden_states = self.language_model(
            **inputs, output_hidden_states=True,
        )[self.hidden_state_name][self.layer]

        mask = self._padding_mask(inputs)
        if mask is None or mask.shape != hidden_states.shape[:2]:
            # No usable padding information: take the final position
            return hidden_states[:, -1, :]

        mask = (mask != 0).to(hidden_states.device)
        seq_len = mask.shape[1]

        # Weight real positions by (index + 1) so the arg-max is the last real
        # position, whichever side the padding is on.
        weights = torch.arange(1, seq_len + 1, device=mask.device)
        last_index = (mask.long() * weights).argmax(dim=1)

        # Rows without any real token fall back to the final position
        last_index = torch.where(
            mask.any(dim=1), last_index, torch.full_like(last_index, seq_len - 1),
        )

        batch_index = torch.arange(hidden_states.shape[0], device=hidden_states.device)
        return hidden_states[batch_index, last_index]

    def forward(self, pos_inputs, neg_inputs):
        """Return reporter(pos) - reporter(neg), one value per batch row.

        Works for a single input as well as for padded batches. For a single
        unpadded input the selected position is the final one.

        Args:
            pos_inputs: mapping of keyword arguments for the language model
                (positive example).
            neg_inputs: same, for the negative example.
        """
        # Shape B x T x H -> B x H
        pos_last_tokens = self._last_token_states(pos_inputs)
        neg_last_tokens = self._last_token_states(neg_inputs)

        # Shape B x H -> B
        pos_logits = self.reporter(pos_last_tokens)
        neg_logits = self.reporter(neg_last_tokens)

        # The difference in logits is meant to be passed through a sigmoid later
        return pos_logits - neg_logits


In [43]:
# Reporter file saved for the selected layer
reporter_path = DATA_DIR.joinpath("reporters", f"layer_{LAYER}.pt")
reporter_path.resolve()

PosixPath('/fsx/home-augustas/VINC-logs/allenai/unifiedqa-v2-t5-11b-1363200/AugustasM/burns-datasets-VINC/strange-montalcini/reporters/layer_24.pt')

In [44]:
%%time

# Wrap the already-loaded language model together with the layer's reporter
reward_model = MyRewardModel(
    language_model=model,
    reporter_path=reporter_path,
    layer=LAYER,
    device=device,
)

CPU times: user 79.3 ms, sys: 0 ns, total: 79.3 ms
Wall time: 8.24 ms


In [45]:
def tokenization_function(q, a):
    """Tokenize question ``q`` as the input and the stripped answer ``a`` as the target.

    Defined here, ahead of its first use, so that this cell runs on a fresh
    kernel; a later cell binds the same name to an equivalent callable.
    """
    return tokenizer(
        q, text_target=a.strip(),
        add_special_tokens=True, return_tensors="pt",
    )


item = dataset[0]
# Work on a copy so the label can be overwritten without touching `item`
item_copy = item.copy()

# Get the positive and negative examples
item_copy["label"] = 1
pos_q, pos_a = template.apply(item_copy)

item_copy["label"] = 0
neg_q, neg_a = template.apply(item_copy)

# Tokenize the inputs
pos_inputs = tokenization_function(pos_q, pos_a).to(device)
neg_inputs = tokenization_function(neg_q, neg_a).to(device)

In [46]:
# Score the single example with gradient tracking disabled
with torch.no_grad():
    outputs = reward_model(pos_inputs, neg_inputs)

# Display the resulting score tensor
outputs

tensor([0.2552], device='cuda:0')

### Sift through the data

In [47]:
def tokenization_function(q, a):
    """Tokenize question ``q`` as the input and the stripped answer ``a`` as the target.

    NOTE: the single-example cell above already calls this name, so on a
    fresh kernel this definition has to be executed before that cell.

    Args:
        q: question / prompt text.
        a: answer text; surrounding whitespace is stripped before tokenizing.

    Returns:
        Whatever ``tokenizer`` returns for these arguments, with PyTorch tensors.
    """
    return tokenizer(
        q, text_target=a.strip(),
        add_special_tokens=True, return_tensors="pt",
    )

In [49]:
%%time

# One reward-model score and one gold label per dataset row
predictions = []
labels = []
for item in dataset:
    labels.append(item["label"])
    # Work on a copy so the label can be overwritten without touching `item`
    item_copy = item.copy()

    # Get the positive and negative examples
    item_copy["label"] = 1
    pos_q, pos_a = template.apply(item_copy)

    item_copy["label"] = 0
    neg_q, neg_a = template.apply(item_copy)

    # Tokenize the inputs
    pos_inputs = tokenization_function(pos_q, pos_a).to(device)
    neg_inputs = tokenization_function(neg_q, neg_a).to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        prediction = reward_model(pos_inputs, neg_inputs)

    # One input per call, so the result holds a single value
    predictions.append(prediction.item())

predictions = torch.tensor(predictions)
labels = torch.tensor(labels)
predictions[:5], labels[:5]

CPU times: user 5.88 s, sys: 28.1 ms, total: 5.9 s
Wall time: 6.2 s


(tensor([0.2552, 0.3842, 0.3257, 0.2843, 0.3764]), tensor([0, 0, 0, 1, 1]))