# Look for questions where the LM confidently disagrees with the reporter
- PopQA high-popularity questions might be good, but they don't have any distractors
  - We could probably just substitute in arbitrary answers from a question with the same relationship type and popularity (this is basically a custom counterfact)
- SciQ with contextual documents
- A little bit of amazon polarity
- Perhaps start with my hand-written easy dataset and ask a LM to generate more similar ones

It will make it easier to identify patterns if we keep each distribution separate at evaluation time (and maybe even during training)


In [1]:
import torch
import numpy as np
import torch
import random

seed = 633
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed);

In [2]:
# generate dataset from counterfact
from datasets import load_dataset, Dataset

# ds_name = "NeelNanda/counterfact-tracing"
ds_name = "akariasai/PopQA"
orig_dataset: Dataset = load_dataset(ds_name, split="test").shuffle(seed=seed)  # type: ignore

# dataset = dataset.map(map_fn, batched=True, batch_size=1, remove_columns=dataset.column_names)
orig_dataset

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset csv (/mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-6aec342ca94994b1.arrow


Dataset({
    features: ['id', 'subj', 'prop', 'obj', 'subj_id', 'prop_id', 'obj_id', 's_aliases', 'o_aliases', 's_uri', 'o_uri', 's_wiki_title', 'o_wiki_title', 's_pop', 'o_pop', 'question', 'possible_answers'],
    num_rows: 14267
})

In [3]:
# process the popQA dataset
# first select only the examples with s_pop in the upper 25% of the distribution
# then, for each example, find another example with the same relationship type (in the upper 25% of the distribution)
# and use this as a distractor object for that example, as a new column in the dataset

s_pop_cutoff = np.percentile(orig_dataset["s_pop"], 90)
pop_dataset = orig_dataset.filter(lambda x: x["s_pop"] >= s_pop_cutoff)

def add_distractor(example):
    distractor_candidates = pop_dataset.filter(lambda x: (x["prop_id"] == example["prop_id"]) and (x["id"] != example["id"]))
    
    try:
        distractor = np.random.choice(distractor_candidates)
        dist_obj, dist_obj_id, dist_o_pop, dist_o_aliases = distractor["obj"], distractor["obj_id"], distractor["o_pop"], distractor["o_aliases"]
    except ValueError:
        dist_obj, dist_obj_id, dist_o_pop, dist_o_aliases = "42", None, None, []
        print("No distractor found for example", example["id"], "filled with \"42\"")
    return {"dist_obj": dist_obj, "dist_obj_id": dist_obj_id, "dist_o_pop": dist_o_pop, "dist_o_aliases": dist_o_aliases}

pop_dataset = pop_dataset.map(add_distractor)
pop_dataset


Loading cached processed dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-e686124b57873689.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-32548bf0f8bbd802.arrow


Dataset({
    features: ['id', 'subj', 'prop', 'obj', 'subj_id', 'prop_id', 'obj_id', 's_aliases', 'o_aliases', 's_uri', 'o_uri', 's_wiki_title', 'o_wiki_title', 's_pop', 'o_pop', 'question', 'possible_answers', 'dist_obj', 'dist_obj_id', 'dist_o_pop', 'dist_o_aliases'],
    num_rows: 1427
})

In [4]:
# q_templates = {
#     22: "What is {}'s occupation?",
#     218: "In what city was {} born?",
#     91: "What genre is {}?",
#     257: "Who is the father of {}?",
#     182: "In what country is {}?",
#     164: "Who was the producer of {}?",
#     526: "Who was the director of {}?",
#     97: "What is {} the capital of?",
#     533: "Who was the screenwriter for {}?",
#     639: "Who was the composer of {}?",
#     472: "What color is {}?",
#     106: "What is the religion of {}?",
#     560: "What sport does {} play?",
#     484: "Who is the author of {}?",
#     292: "Who is the mother of {}?",
#     422: "What is the capital of {}?"
# }
q_templates = {
    22: "{}'s occupation is",
    218: "The city of birth of {} is",
    91: "The genre of {} is",
    257: "The father of {} is",
    182: "{} is located in the country",
    164: "The producer of {} was",
    526: "The director of {} was",
    97: "{} is the capital of",
    533: "The screenwriter for {} was",
    639: "The composer of {} was",
    472: "The color of {} is",
    106: "The religion of {} is",
    560: "The sport played by {} is",
    484: "The author of {} is",
    292: "The mother of {} is",
    422: "The capital of {} is"
}

def get_labeled_texts(example, bos_token, few_shot_prefix=None):
    """few-shot-prefix is directly prepended to the prompt, without a newline, if provided"""
    prefix = ""
    if few_shot_prefix:
        prefix = few_shot_prefix + prefix
    prop_id = example["prop_id"]
    q = q_templates[prop_id].format(example["subj"]) + " "
    prompt = bos_token + prefix + q

    suffix =  "\n\nIs this true?"
    text_true = prompt + example["obj"] + suffix
    text_false = prompt + example["dist_obj"] + suffix  # distractor object
    return {"texts": [text_true, text_false], "labels": [1, 0]}

def get_few_shot_prefix(examples):
    example_to_str = lambda example: example["text"] + " " + ("No", "Yes")[example["label"]] + "\n\n"
    example_strs = []
    for example in examples:
        exs = get_labeled_texts(example, bos_token="")
                
        # dict of list into list of dict
        exs = [dict(zip(["text", "label"], t)) for t in zip(*exs.values())]
        
        example_strs.extend(example_to_str(ex) for ex in exs)
    np.random.shuffle(example_strs)
    return "".join(example_strs)


In [5]:
n_total = 1400
# texts = np.array(dataset[:n_total]["texts"])
# labels = np.array(dataset[:n_total]["labels"])
dataset = pop_dataset.select(range(n_total))
n_shots = 5
few_shot_set = pop_dataset.select(range(n_total, n_total + 20))


In [6]:
# load a model and tokenizer huggingface's transformers library
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer
# model_name = "huggyllama/llama-7b"
model_name = "gpt2-xl"
tokenizer = LlamaTokenizer.from_pretrained(model_name) if "llama" in model_name else AutoTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, device_map={"": "cuda:5"}) if "llama" in model_name \
    else AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, device_map={"": "cuda:5"})


In [7]:
from tqdm import tqdm

In [8]:
def gather_logprobs(outputs, tokenized_text):
    # returns a [n_tokens,] numpy array of logprobs
    logprobs = outputs["logits"].log_softmax(dim=-1)
    return torch.gather(logprobs, 2, tokenized_text.input_ids.unsqueeze(2)).squeeze(2).squeeze(0)

def get_hiddens(dataset, few_shot_set):
    # run the model and get the hidden states at each layer
    # creates a len(texts) x n_layers x hidden_dimension array of zeros
    n_layer = 32 if model_name == "huggyllama/llama-7b" else model.config.n_layer
    hidden_size = model.config.hidden_size
    hiddens = np.zeros((2 * len(dataset), n_layer + 1, hidden_size))
    lm_probs = np.zeros((2 * len(dataset),))
    texts = np.zeros((2 * len(dataset),), dtype=object)
    labels = np.zeros((2 * len(dataset),), dtype=int)
    i = 0
    with torch.no_grad():
        for example in tqdm(dataset, total=n_total):
            few_shot_prefix = get_few_shot_prefix(few_shot_set.shuffle(seed=seed).select(range(n_shots))) if n_shots > 0 else None
            labeled_texts = get_labeled_texts(example, tokenizer.bos_token, few_shot_prefix=few_shot_prefix)
            for text, label in zip(labeled_texts["texts"], labeled_texts["labels"]):
                tokenized_text = tokenizer(text, return_tensors="pt").to("cuda")
                outputs = model(**tokenized_text, output_hidden_states=True)
                
                hidden_states = outputs["hidden_states"]  # a tuple of torch tensors, one for each layer
                hiddens[i, :, :] = torch.cat(hidden_states).cpu().numpy()[:, -1]  # all layers, last token

                # # get total logprob assigned to each string
                # logprobs_true = gather_logprobs(outputs_true, tokenized_text_true).cpu().numpy()
                # logprobs_false = gather_logprobs(outputs_false, tokenized_text_false).cpu().numpy()
                # # p_true, p_false = np.exp(logprobs_true.sum()), np.exp(logprobs_false.sum())
                # # lm_preds[2 * i] = p_true / (p_true + p_false)
                # # lm_preds[2 * i + 1] = p_false / (p_true + p_false)
                # # the above is unstable, so we do the following instead
                # logaddexp = np.logaddexp(logprobs_true.sum(), logprobs_false.sum())
                # lm_probs[2 * i] = np.exp(logprobs_true.sum() - logaddexp)
                # lm_probs[2 * i + 1] = np.exp(logprobs_false.sum() - logaddexp)
                
                # get prob assigned to each target
                no_id, yes_id = tokenizer.convert_tokens_to_ids([" No", " Yes"])
                p_no, p_yes = outputs["logits"][0, -1, [no_id, yes_id]].softmax(dim=-1).cpu().numpy()
                
                lm_probs[i] = p_no / (p_yes + p_no)  # probability of "no" (no error)
                labels[i] = label
                texts[i] = text
                i += 1
    return hiddens, lm_probs, labels, texts
hiddens, lm_probs, labels, texts = get_hiddens(dataset, few_shot_set)

  0%|          | 0/1400 [00:00<?, ?it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 1/1400 [00:01<38:41,  1.66s/it]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 2/1400 [00:01<19:21,  1.20it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-edc3b374124de1d3.arrow
  0%|          | 3/1400 [00:02<13:21,  1.74it/s]Loading cached shuffled indices for dataset at /mnt/ssd-2/hf_cache/akariasai___csv/akariasai--PopQA-f60940326e75cf5e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325

In [9]:
model.config.hidden_size

1600

In [10]:
# make a train/test split and keep them separate
shuffled_idxs = np.random.permutation(np.arange(len(hiddens)))
shuffled_hiddens = hiddens[shuffled_idxs]
shuffled_labels = labels[shuffled_idxs]
train_size = int(len(shuffled_hiddens) * 0.7)
train_hiddens = shuffled_hiddens[:train_size]
test_hiddens = shuffled_hiddens[train_size:]
train_labels = shuffled_labels[:train_size]
test_labels = shuffled_labels[train_size:]
train_texts = texts[shuffled_idxs][:train_size]
test_texts = texts[shuffled_idxs][train_size:]
test_lm_probs = lm_probs[shuffled_idxs][train_size:]

# train a classifier on the hidden states
from sklearn.linear_model import LogisticRegressionCV
# use cross-validation to find the best hyperparameters
# use the best hyperparameters to train a final model
Cs = 10 ** np.linspace(-5, 5, 11)
n_layer = 32 if model_name == "huggyllama/llama-7b" else model.config.n_layer
layer = n_layer // 2 + 1  # the layer to use for classification, somewhat arbitrary but middle layers work better
reporter = LogisticRegressionCV(Cs=Cs, cv=2).fit(train_hiddens[:, layer, :], train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
# get reporter regularization parameters
print("best regularization parameter:", reporter.C_[0])
# get model predictions on the test set
test_preds = reporter.predict(test_hiddens[:, layer, :])

is_correct = test_preds == test_labels
acc = np.mean(is_correct)
stderr = 0.5 / np.sqrt(len(test_labels))  # SE_prop = sqrt(p(1-p)/n)
print(f"Accuracy: {acc:.3f} ± {stderr:.3f}")
correct_examples = test_texts[is_correct]
incorrect_examples = test_texts[~is_correct]

# train acc
train_preds = reporter.predict(train_hiddens[:, layer, :])
train_acc = np.mean(train_preds == train_labels)
print(f"Train accuracy: {train_acc:.3f}")

# analyze these examples to see what the reporter is getting right and wrong...

best regularization parameter: 0.01
Accuracy: 0.577 ± 0.017
Train accuracy: 0.756


In [12]:
sum(lm_probs > 0.5) / len(lm_probs)

0.0

In [13]:
# lm accuracy
cal_thresh = np.quantile(test_lm_probs, test_labels.mean())  # the LM usually just guesses "no error", so we calibrate it to the true proportion of "no error" examples
lm_preds = test_lm_probs > cal_thresh
lm_acc = np.mean(lm_preds == test_labels)
lm_stderr = 0.5 / np.sqrt(len(test_labels))  # SE_prop = sqrt(p(1-p)/n).
print(f"LM Accuracy: {lm_acc:.3f} ± {lm_stderr:.3f}")

LM Accuracy: 0.498 ± 0.017


In [14]:
# where do they disagree? Make a confusion matrix
lm_correct = (test_lm_probs > 0.5) == test_labels
reporter_correct = test_preds == test_labels

lm_better = lm_correct & ~reporter_correct
reporter_better = ~lm_correct & reporter_correct
unequal = lm_correct != reporter_correct
print(f"The LM is better on {lm_better.sum()} examples")
print(f"The reporter is better on {reporter_better.sum()} examples")
print(f"The LM and reporter disagree on {unequal.sum()} examples")

The LM is better on 166 examples
The reporter is better on 232 examples
The LM and reporter disagree on 398 examples


In [15]:
# save results
from pathlib import Path
import time

prefix = time.strftime("%Y-%m-%d-%H:%M")
cache_dir = Path("./lr-experiments") / ds_name / prefix
cache_dir.mkdir(exist_ok=True, parents=True)
np.save(cache_dir / "hiddens.npy", hiddens)
np.save(cache_dir / "lm_probs.npy", lm_probs)
np.save(cache_dir / "labels.npy", labels)
np.save(cache_dir / "texts.npy", texts)

In [16]:
test_texts[~unequal]

array(['<|endoftext|>The city of birth of Rafael Reyes is Cotija de la Paz\n\nIs this true? Yes\n\nThe screenwriter for The Fly was John Landis\n\nIs this true? No\n\nThe religion of Kumail Nanjiani is Episcopal Church\n\nIs this true? No\n\nThe producer of 19 was Debra Hill\n\nIs this true? No\n\nThe city of birth of Rafael Reyes is Canaan\n\nIs this true? No\n\nThe mother of Otto von Habsburg is Zita of Bourbon-Parma\n\nIs this true? Yes\n\nThe screenwriter for The Fly was George Langelaan\n\nIs this true? Yes\n\nThe mother of Otto von Habsburg is Sirikit\n\nIs this true? No\n\nThe religion of Kumail Nanjiani is atheism\n\nIs this true? Yes\n\nThe producer of 19 was Jim Abbiss\n\nIs this true? Yes\n\nThe producer of Crash was Joss Whedon\n\nIs this true?',
       '<|endoftext|>The mother of Otto von Habsburg is Sirikit\n\nIs this true? No\n\nThe mother of Otto von Habsburg is Zita of Bourbon-Parma\n\nIs this true? Yes\n\nThe religion of Kumail Nanjiani is Episcopal Church\n\nIs this 

In [17]:
test_texts[lm_better]

array(['<|endoftext|>The screenwriter for The Fly was George Langelaan\n\nIs this true? Yes\n\nThe screenwriter for The Fly was John Landis\n\nIs this true? No\n\nThe mother of Otto von Habsburg is Zita of Bourbon-Parma\n\nIs this true? Yes\n\nThe religion of Kumail Nanjiani is Episcopal Church\n\nIs this true? No\n\nThe mother of Otto von Habsburg is Sirikit\n\nIs this true? No\n\nThe producer of 19 was Debra Hill\n\nIs this true? No\n\nThe religion of Kumail Nanjiani is atheism\n\nIs this true? Yes\n\nThe city of birth of Rafael Reyes is Cotija de la Paz\n\nIs this true? Yes\n\nThe producer of 19 was Jim Abbiss\n\nIs this true? Yes\n\nThe city of birth of Rafael Reyes is Canaan\n\nIs this true? No\n\nThe mother of Queen Victoria is Jane Randolph Jefferson\n\nIs this true?',
       '<|endoftext|>The mother of Otto von Habsburg is Sirikit\n\nIs this true? No\n\nThe religion of Kumail Nanjiani is atheism\n\nIs this true? Yes\n\nThe screenwriter for The Fly was John Landis\n\nIs this tru