# Look for questions where the LM confidently disagrees with the reporter
- PopQA high-popularity questions might be good, but they don't have any distractors
  - We could probably just substitute in arbitrary answers from a question with the same relationship type and popularity (this is basically a custom counterfact)
- SciQ with contextual documents
- A little bit of amazon polarity
- Perhaps start with my hand-written easy dataset and ask a LM to generate more similar ones

It will make it easier to identify patterns if we keep each distribution separate at evaluation time (and maybe even during training)


In [1]:
import torch
import numpy as np
import torch
import random

seed = 633
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed);

In [2]:
# generate dataset from counterfact
from datasets import load_dataset, Dataset

# ds_name = "NeelNanda/counterfact-tracing"
ds_name = "akariasai/PopQA"
orig_dataset: Dataset = load_dataset(ds_name, split="test").shuffle(seed=seed)  # type: ignore

# dataset = dataset.map(map_fn, batched=True, batch_size=1, remove_columns=dataset.column_names)
orig_dataset

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset csv (/mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-6aec342ca94994b1.arrow


Dataset({
    features: ['id', 'subj', 'prop', 'obj', 'subj_id', 'prop_id', 'obj_id', 's_aliases', 'o_aliases', 's_uri', 'o_uri', 's_wiki_title', 'o_wiki_title', 's_pop', 'o_pop', 'question', 'possible_answers'],
    num_rows: 14267
})

In [3]:
# process the popQA dataset
# first select only the examples with s_pop in the upper 25% of the distribution
# then, for each example, find another example with the same relationship type (in the upper 25% of the distribution)
# and use this as a distractor object for that example, as a new column in the dataset

s_pop_cutoff = np.percentile(orig_dataset["s_pop"], 90)
pop_dataset = orig_dataset.filter(lambda x: x["s_pop"] >= s_pop_cutoff)

def add_distractor(example):
    distractor_candidates = pop_dataset.filter(lambda x: (x["prop_id"] == example["prop_id"]) and (x["id"] != example["id"]))
    
    try:
        distractor = np.random.choice(distractor_candidates)
        dist_obj, dist_obj_id, dist_o_pop, dist_o_aliases = distractor["obj"], distractor["obj_id"], distractor["o_pop"], distractor["o_aliases"]
    except ValueError:
        dist_obj, dist_obj_id, dist_o_pop, dist_o_aliases = "42", None, None, []
        print("No distractor found for example", example["id"], "filled with \"42\"")
    return {"dist_obj": dist_obj, "dist_obj_id": dist_obj_id, "dist_o_pop": dist_o_pop, "dist_o_aliases": dist_o_aliases}

pop_dataset = pop_dataset.map(add_distractor)
pop_dataset


Loading cached processed dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-e686124b57873689.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-32548bf0f8bbd802.arrow


Dataset({
    features: ['id', 'subj', 'prop', 'obj', 'subj_id', 'prop_id', 'obj_id', 's_aliases', 'o_aliases', 's_uri', 'o_uri', 's_wiki_title', 'o_wiki_title', 's_pop', 'o_pop', 'question', 'possible_answers', 'dist_obj', 'dist_obj_id', 'dist_o_pop', 'dist_o_aliases'],
    num_rows: 1427
})

In [4]:
q_templates = {
    22: "What is {}'s occupation?",
    218: "In what city was {} born?",
    91: "What genre is {}?",
    257: "Who is the father of {}?",
    182: "In what country is {}?",
    164: "Who was the producer of {}?",
    526: "Who was the director of {}?",
    97: "What is {} the capital of?",
    533: "Who was the screenwriter for {}?",
    639: "Who was the composer of {}?",
    472: "What color is {}?",
    106: "What is the religion of {}?",
    560: "What sport does {} play?",
    484: "Who is the author of {}?",
    292: "Who is the mother of {}?",
    422: "What is the capital of {}?"
}
# q_templates = {
#     22: "{}'s occupation is",
#     218: "The city of birth of {} is",
#     91: "The genre of {} is",
#     257: "The father of {} is",
#     182: "{} is located in the country",
#     164: "The producer of {} was",
#     526: "The director of {} was",
#     97: "{} is the capital of",
#     533: "The screenwriter for {} was",
#     639: "The composer of {} was",
#     472: "The color of {} is",
#     106: "The religion of {} is",
#     560: "The sport played by {} is",
#     484: "The author of {} is",
#     292: "The mother of {} is",
#     422: "The capital of {} is"
# }

def get_labeled_texts(example, bos_token, few_shot_prefix=None):
    """few-shot-prefix is directly prepended to the prompt, without a newline, if provided"""
    prefix = ""
    if few_shot_prefix:
        prefix = few_shot_prefix + prefix
    prop_id = example["prop_id"]
    q = q_templates[prop_id].format(example["subj"]) + " "
    prompt = bos_token + prefix + q

    suffix =  "\n\nIs this true?"
    text_true = prompt + example["obj"] + suffix
    text_false = prompt + example["dist_obj"] + suffix  # distractor object
    return {"texts": [text_true, text_false], "labels": [1, 0]}

def get_few_shot_prefix(examples):
    example_to_str = lambda example: example["text"] + " " + ("No", "Yes")[example["label"]] + "\n\n"
    example_strs = []
    for example in examples:
        exs = get_labeled_texts(example, bos_token="")
                
        # dict of list into list of dict
        exs = [dict(zip(["text", "label"], t)) for t in zip(*exs.values())]
        
        example_strs.extend(example_to_str(ex) for ex in exs)
    np.random.shuffle(example_strs)
    return "".join(example_strs)


In [5]:
n_total = 1400
# texts = np.array(dataset[:n_total]["texts"])
# labels = np.array(dataset[:n_total]["labels"])
dataset = pop_dataset.select(range(n_total))
n_shots = 10 // 2  # 5 examples per class, 2 classes
few_shot_set = pop_dataset.select(range(n_total, n_total + 20))


In [6]:
# load a model and tokenizer huggingface's transformers library
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
# model_name = "huggyllama/llama-7b"
# model_name = "gpt2-xl"
model_name = "/mnt/ssd-2/nora/vicuna-original-13b"
is_llama = "llama" in model_name or "vicuna" in model_name
device = "cuda:5"
tokenizer = LlamaTokenizer.from_pretrained(model_name, add_prefix_space=False) if is_llama else AutoTokenizer.from_pretrained(model_name, add_prefix_space=False)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": device}) if is_llama \
    else AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": device})


Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.04s/it]


In [7]:
from tqdm import tqdm

In [8]:
from itertools import islice
import json

is_corrects = []
is_binary_corrects = []
binary_lm_probs = []
binary_labels = []
binary_texts = []
n_slice = 200
with torch.no_grad():
    for example in tqdm(islice(dataset, n_slice), total=n_slice):
        few_shot_prefix = "\n\n".join([q + " " + o for q, o in zip(few_shot_set["question"], few_shot_set["obj"])]) + "\n\n"
        question = few_shot_prefix + example["question"]
        obj = example["obj"]
        objs = [obj] + json.loads(example["o_aliases"])
        inputs = tokenizer(question, return_tensors="pt").to(device)
        # generate from model
        outputs = model.generate(**inputs, max_length=len(inputs["input_ids"][0]) + 10, do_sample=False, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        is_correct = False
        for o in objs:
            if o.lower() in pred.lower():
                is_correct = True
                break
        is_corrects.append(is_correct)

        # binary classification
        few_shot_prefix = get_few_shot_prefix(few_shot_set.shuffle(seed=seed).select(range(n_shots))) if n_shots > 0 else None
        labeled_texts = get_labeled_texts(example, tokenizer.bos_token, few_shot_prefix=few_shot_prefix)
        for text, label in zip(labeled_texts["texts"], labeled_texts["labels"]):
            tokenized_text = tokenizer(text, return_tensors="pt").to("cuda")
            outputs = model(**tokenized_text, output_hidden_states=True)
            
            hidden_states = outputs["hidden_states"]  # a tuple of torch tensors, one for each layer
            # hiddens[i, :, :] = torch.cat(hidden_states).cpu().numpy()[:, -1]  # all layers, last token
            
            # get prob assigned to each target
            noyes_tokens = ["No", "Yes"] if is_llama else [" No", " Yes"]
            no_id, yes_id = tokenizer(noyes_tokens, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(1)
            p_no, p_yes = outputs["logits"][0, -1, [no_id, yes_id]].softmax(dim=-1).cpu().numpy()
            
            binary_lm_probs.append(p_no / (p_yes + p_no))  # probability of "no" (no error)
            is_binary_corrects.append(label == (p_no < p_yes))
            binary_labels.append(label)
            binary_texts.append(text)
        # print()
        # print("Gold:", obj)
        # print("-" * 50)
np.mean(is_corrects), 2 * np.std(is_corrects) / np.sqrt(n_slice)

  0%|          | 0/200 [00:00<?, ?it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 1/200 [00:02<08:07,  2.45s/it]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  1%|          | 2/200 [00:03<05:00,  1.52s/it]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  2%|▏         | 3/200 [00:04<04:00,  1.22s/it]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740

(0.615, 0.06881496930174422)

In [13]:
print(f"Binary classification accuracy: {np.mean(is_binary_corrects):.3f} ± {2 * np.std(is_binary_corrects) / np.sqrt(len(is_binary_corrects)):.3f}")
print(f"LM free generation accuracy: {np.mean(is_corrects):.3f} ± {2 * np.std(is_corrects) / np.sqrt(n_slice):.3f}")

Binary classification accuracy: 0.815 ± 0.039
LM free generation accuracy: 0.615 ± 0.069


In [14]:
print("Example few-shot prefix:")
print(binary_texts[0])

Example few-shot prefix:
<s>Who was the producer of 19? Debra Hill

Is this true? No

Who was the screenwriter for The Fly? John Landis

Is this true? No

In what city was Rafael Reyes born? Canaan

Is this true? No

In what city was Rafael Reyes born? Cotija de la Paz

Is this true? Yes

Who was the producer of 19? Jim Abbiss

Is this true? Yes

What is the religion of Kumail Nanjiani? atheism

Is this true? Yes

Who is the mother of Otto von Habsburg? Zita of Bourbon-Parma

Is this true? Yes

What is the religion of Kumail Nanjiani? Episcopal Church

Is this true? No

Who is the mother of Otto von Habsburg? Sirikit

Is this true? No

Who was the screenwriter for The Fly? George Langelaan

Is this true? Yes

Who was the director of Legion? Scott Stewart

Is this true?


In [16]:
model.config.num_hidden_layers

40

In [17]:
def gather_logprobs(outputs, tokenized_text):
    # returns a [n_tokens,] numpy array of logprobs
    logprobs = outputs["logits"].log_softmax(dim=-1)
    return torch.gather(logprobs, 2, tokenized_text.input_ids.unsqueeze(2)).squeeze(2).squeeze(0)

def get_hiddens(dataset, few_shot_set):
    # run the model and get the hidden states at each layer
    # creates a len(texts) x n_layers x hidden_dimension array of zeros
    n_layer = model.config.num_hidden_layers
    hidden_size = model.config.hidden_size
    hiddens = np.zeros((2 * len(dataset), n_layer + 1, hidden_size))
    lm_probs = np.zeros((2 * len(dataset),))
    texts = np.zeros((2 * len(dataset),), dtype=object)
    labels = np.zeros((2 * len(dataset),), dtype=int)
    i = 0
    with torch.no_grad():
        for example in tqdm(dataset, total=n_total):
            few_shot_prefix = get_few_shot_prefix(few_shot_set.shuffle(seed=seed).select(range(n_shots))) if n_shots > 0 else None
            labeled_texts = get_labeled_texts(example, tokenizer.bos_token, few_shot_prefix=few_shot_prefix)
            for text, label in zip(labeled_texts["texts"], labeled_texts["labels"]):
                tokenized_text = tokenizer(text, return_tensors="pt").to("cuda")
                outputs = model(**tokenized_text, output_hidden_states=True)
                
                hidden_states = outputs["hidden_states"]  # a tuple of torch tensors, one for each layer
                hiddens[i, :, :] = torch.cat(hidden_states).cpu().numpy()[:, -1]  # all layers, last token
                
                # get prob assigned to each target
                noyes_tokens = ["No", "Yes"] if is_llama else [" No", " Yes"]
                no_id, yes_id = tokenizer(noyes_tokens, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(1)
                p_no, p_yes = outputs["logits"][0, -1, [no_id, yes_id]].softmax(dim=-1).cpu().numpy()
                
                lm_probs[i] = p_yes / (p_yes + p_no)
                labels[i] = label
                texts[i] = text
                i += 1
    return hiddens, lm_probs, labels, texts
hiddens, lm_probs, labels, texts = get_hiddens(dataset, few_shot_set)

  0%|          | 0/1400 [00:00<?, ?it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 1/1400 [00:00<07:59,  2.92it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 2/1400 [00:00<08:06,  2.87it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 3/1400 [00:01<08:14,  2.83it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325

In [18]:
model.config.hidden_size

5120

In [20]:
# make a train/test split and keep them separate
shuffled_idxs = np.random.permutation(np.arange(len(hiddens)))
shuffled_hiddens = hiddens[shuffled_idxs]
shuffled_labels = labels[shuffled_idxs]
train_size = int(len(shuffled_hiddens) * 0.7)
train_hiddens = shuffled_hiddens[:train_size]
test_hiddens = shuffled_hiddens[train_size:]
train_labels = shuffled_labels[:train_size]
test_labels = shuffled_labels[train_size:]
train_texts = texts[shuffled_idxs][:train_size]
test_texts = texts[shuffled_idxs][train_size:]
test_lm_probs = lm_probs[shuffled_idxs][train_size:]

# train a classifier on the hidden states
from sklearn.linear_model import LogisticRegressionCV
# use cross-validation to find the best hyperparameters
# use the best hyperparameters to train a final model
Cs = 10 ** np.linspace(-5, 5, 11)
n_layer = model.config.num_hidden_layers
layer = n_layer // 2 + 1  # the layer to use for classification, somewhat arbitrary but middle layers work better
reporter = LogisticRegressionCV(Cs=Cs, cv=2).fit(train_hiddens[:, layer, :], train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# get reporter regularization parameters
print("best regularization parameter:", reporter.C_[0])
# get model predictions on the test set
test_preds = reporter.predict(test_hiddens[:, layer, :])
test_scores = reporter.predict_proba(test_hiddens[:, layer, :])[:, 1]
confidence_threshold = 0.9
reporter_confident = np.abs(test_scores - 0.5) > (confidence_threshold - 0.5)

is_correct = test_preds == test_labels
acc = np.mean(is_correct)
stderr = 0.5 / np.sqrt(len(test_labels))  # SE_prop = sqrt(p(1-p)/n)
print(f"Accuracy: {acc:.3f} ± {stderr:.3f}")
correct_examples = test_texts[is_correct]
incorrect_examples = test_texts[~is_correct]

# train acc
train_preds = reporter.predict(train_hiddens[:, layer, :])
train_acc = np.mean(train_preds == train_labels)
print(f"Train accuracy: {train_acc:.3f}")

# analyze these examples to see what the reporter is getting right and wrong...

best regularization parameter: 0.001
Accuracy: 0.868 ± 0.017
Train accuracy: 0.903


In [48]:
sum(lm_probs > 0.5) / len(lm_probs)

0.4767857142857143

In [49]:
# lm accuracy
cal_thresh = np.quantile(test_lm_probs, test_labels.mean())  # the LM usually just guesses "no error", so we calibrate it to the true proportion of "no error" examples
lm_preds = test_lm_probs > cal_thresh
lm_acc = np.mean(lm_preds == test_labels)
lm_stderr = 0.5 / np.sqrt(len(test_labels))  # SE_prop = sqrt(p(1-p)/n).
print(f"LM Accuracy: {lm_acc:.3f} ± {lm_stderr:.3f}")

LM Accuracy: 0.822 ± 0.017


In [50]:
def print_texts(texts):
    for text in texts:
        # remove the few-shot prefix
        last = "\n\n".join(text.split("\n\n")[-2:])
        print(last)
        print("-" * 50)

In [59]:
print(f"Results for {model_name} on {ds_name} with {2 * n_shots} shots:")
print(f"Example input:\n{texts[0]}")
print("-" * 50)
print()
print(f"LM accuracy: {lm_acc:.3f} ± {lm_stderr:.3f}")
print(f"Train reporter accuracy (layer {layer}): {train_acc:.3f}")
print(f"Reporter accuracy (layer {layer}): {acc:.3f} ± {stderr:.3f}")
print()

lm_correct = (test_lm_probs > 0.5) == test_labels
reporter_correct = test_preds == test_labels

lm_better = lm_correct & ~reporter_correct
reporter_better = ~lm_correct & reporter_correct
unequal = lm_correct != reporter_correct
print(f"The LM and reporter disagree on {unequal.sum()} examples out of {len(test_lm_probs)}")
print(f"The LM is better on {lm_better.sum()} examples")
print(f"The reporter is better on {reporter_better.sum()} examples")
print()
lm_confident = np.abs(test_lm_probs - 0.5) > (confidence_threshold - 0.5)
print(f"LM is confident on {sum(lm_confident)} examples out of {len(test_lm_probs)} (threshold score = {confidence_threshold:.2f})")
print(f"Reporter is confident on {sum(reporter_confident)} examples out of {len(test_lm_probs)}")
conf_disagree = lm_confident & reporter_confident & (lm_preds != test_preds)
print(f"LM and reporter confidently disagree on {sum(conf_disagree)} examples out of {len(test_lm_probs)}")
conf_disagree_reporter_correct = conf_disagree & is_correct
print(f"Reporter is correct on {sum(conf_disagree_reporter_correct)} out of {sum(conf_disagree)} examples where LM and reporter confidently disagree")
conf_disagree_lm_correct = conf_disagree & ~is_correct
print(f"LM is correct on {sum(conf_disagree_lm_correct)} out of {sum(conf_disagree)} examples where LM and reporter confidently disagree")

Results for /mnt/ssd-2/nora/vicuna-original-13b on akariasai/PopQA with 10 shots:
Example input:
<s>Who was the screenwriter for The Fly? John Landis

Is this true? No

Who was the producer of 19? Debra Hill

Is this true? No

What is the religion of Kumail Nanjiani? atheism

Is this true? Yes

Who is the mother of Otto von Habsburg? Sirikit

Is this true? No

Who was the screenwriter for The Fly? George Langelaan

Is this true? Yes

Who was the producer of 19? Jim Abbiss

Is this true? Yes

Who is the mother of Otto von Habsburg? Zita of Bourbon-Parma

Is this true? Yes

In what city was Rafael Reyes born? Cotija de la Paz

Is this true? Yes

In what city was Rafael Reyes born? Canaan

Is this true? No

What is the religion of Kumail Nanjiani? Episcopal Church

Is this true? No

Who was the director of Legion? Scott Stewart

Is this true?
--------------------------------------------------

LM accuracy: 0.822 ± 0.017
Train reporter accuracy (layer 21): 0.903
Reporter accuracy (layer 21

In [60]:
print("Examples where LM and reporter confidently disagree:")
print_texts(test_texts[conf_disagree])

Examples where LM and reporter confidently disagree:
What is Kuala Lumpur the capital of? Malaya

Is this true?
--------------------------------------------------
What is Prague the capital of? Czech Socialist Republic

Is this true?
--------------------------------------------------


In [61]:
# save results
from pathlib import Path
import time

prefix = time.strftime("%Y-%m-%d-%H:%M")
cache_dir = Path("./lr-experiments") / ds_name / prefix
cache_dir.mkdir(exist_ok=True, parents=True)
np.save(cache_dir / "hiddens.npy", hiddens)
np.save(cache_dir / "lm_probs.npy", lm_probs)
np.save(cache_dir / "labels.npy", labels)
np.save(cache_dir / "texts.npy", texts)