# Setup

In [None]:
%pip install transformers
%pip install datasets

In [None]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import sklearn
import dataclasses
import einops
from CCS import CCS

In [None]:
# Load the "multiple_choice" configuration of the "truthful_qa" dataset and keep
# only its 'validation' split. Later cells read the 'question' and 'mc2_targets'
# columns of this object.
from datasets import load_dataset
tqamc = load_dataset("truthful_qa", "multiple_choice")['validation']

# from datasets import load_dataset
# dataset = load_dataset("boolq", split='train')
# print(dataset[4])

# def boolq_to_prompt(data, FLAG=1): 
#   if FLAG == 1: 
#     return "Passage: " + data['passage'] + "\n\nAfter reading this passage, I have a question: " + data['question'] + "? Yes or no?"

# prompt = boolq_to_prompt(dataset[0])
# print(prompt)

### Loading and testing model

In [None]:
# Load the pretrained "gpt2-medium" tokenizer and LM-head model.
# output_hidden_states=True is set on the config so that a forward pass also
# returns the hidden states; later cells read them via model(...)[2].
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
config = GPT2Config.from_pretrained("gpt2-medium", output_hidden_states=True)
model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=config) 

In [None]:
def generate(prompt, max_length = 40, do_sample=True, top_p = 0.95, top_k =60, **model_kwargs): 
    """
    Continue `prompt` with the module-level `model` and return the decoded text.

    Args:
        prompt: text to condition on.
        max_length, do_sample, top_p, top_k: forwarded to `model.generate`.
        **model_kwargs: any further keyword arguments for `model.generate`.

    Returns:
        The first generated sequence decoded by the module-level `tokenizer`,
        with special tokens skipped.
    """
    token_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    sampling_options = dict(
        max_length=max_length,
        do_sample=do_sample,
        top_p=top_p,
        top_k=top_k,
    )
    generated = model.generate(token_ids, **sampling_options, **model_kwargs)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke-test generation: prompt with question 5 followed by the answer choice at
# index 3 of its mc2 targets, then print the continuation. generate() samples by
# default (do_sample=True), so the printed text varies between runs.
out = generate(tqamc['question'][5] + " " + tqamc['mc2_targets'][5]['choices'][3])
print(out)

In [None]:
# RESULT = 0.47869747411559416
# in line with the TruthfulQA paper result

# corrects = []
# for i in tqdm(range(len(tqamc['validation']))):
#   prompt = tqamc['validation'][i]['question'] + " "
#   mc2_dict = tqamc['validation'][i]['mc2_targets']
#   corrects.append(normalized_correct_answers(prompt, mc2_dict))

# print(sum(corrects)/len(corrects))

### Preparing dataset

In [None]:
def split_tqa_mc2(tqamc, frac, start=True): 
    """
    Split TruthfulQA MC2 items into two-choice datapoints.

    TruthfulQA MC2 has multiple correct answers per question. Each question is
    copied once per (correct answer, incorrect answer) pair so that there are
    only two options per datapoint, with the correct answer always first.
    Answers are paired by their label rather than by position, so questions
    with unequal numbers of correct and incorrect answers never yield a pair
    of two incorrect (or two correct) answers. A question contributes
    min(#correct, #incorrect) datapoints.

    Args:
        tqamc: mapping-like dataset with a 'question' column and an
            'mc2_targets' column; each target holds parallel 'choices' and
            'labels' lists. A label equal to 1 is treated as a correct answer.
        frac: fraction of the questions, counted from the beginning, to process.
        start: unused; kept so existing callers keep working.

    Returns:
        dict of three parallel lists: 'question' (str), 'choices'
        ([correct, incorrect]) and 'labels' (the two original label values).
    """
    # Read each column once. On a Hugging Face Dataset, indexing by column name
    # inside the loop would re-read the whole column on every iteration.
    questions = tqamc['question']
    targets = tqamc['mc2_targets']

    dataset = {'question': [], 'choices': [], 'labels': []}
    dlen = int(frac*len(questions))
    for i in tqdm(range(dlen)):
        choices = targets[i]['choices']
        labels = targets[i]['labels']
        correct = [k for k, label in enumerate(labels) if label == 1]
        incorrect = [k for k, label in enumerate(labels) if label != 1]
        # zip stops at the shorter list, so every emitted pair is (correct, incorrect).
        for good, bad in zip(correct, incorrect):
            dataset['question'].append(questions[i])
            dataset['choices'].append([choices[good], choices[bad]])
            dataset['labels'].append([labels[good], labels[bad]])
    return dataset

# Build the two-choice dataset from the first 7.5% of the TruthfulQA validation
# questions (frac=0.075).
dataset = split_tqa_mc2(tqamc, 0.075)

In [None]:
# The first 100 two-choice datapoints form the training split; everything after
# them is held out as the test split. Every column is sliced at the same index.
train = {column: rows[:100] for column, rows in dataset.items()}
test = {column: rows[100:] for column, rows in dataset.items()}

### Gathering activations

In [None]:
def get_activations(model, tokenizer, dataset, layers, frac): 
    """
    Collect last-token hidden states for both answer choices of each datapoint.

    Takes in the TruthfulQA MC2 dataset (after the two-choice split) and, for
    the first int(frac * len(dataset['question'])) datapoints, runs the model on
    "question + ' ' + choice" for choices 0 and 1.

    Args:
        model: callable such that model(input_ids)[2] is a sequence of
            per-layer hidden states indexed as [layer][batch, position].
        tokenizer: callable returning a mapping with an "input_ids" tensor.
        dataset: dict with parallel 'question' and 'choices' lists.
        layers: hidden-state indices to extract; their activations are
            concatenated in the given order.
        frac: fraction of the dataset (from the start) to process.

    Returns:
        A (dlen, 2, len_activations_at_layer * len(layers)) tensor of
        activations (no gradient attached) and a (dlen, 1) tensor of labels,
        which is all zeros.
    """
    dlen = int(frac*len(dataset['question']))
    per_datapoint = []
    # Inference only: skip building an autograd graph for every forward pass.
    with t.no_grad():
        for idx in tqdm(range(dlen)):
            question = dataset['question'][idx]
            answers = (dataset['choices'][idx][0], dataset['choices'][idx][1])
            per_choice = []
            for answer in answers:
                input_ids = tokenizer(question + " " + answer, return_tensors="pt")["input_ids"]
                # One forward pass per prompt; every requested layer is read
                # from the same set of hidden states.
                hidden_states = model(input_ids)[2]
                # Concatenating the per-layer vectors gives the flattened
                # (len_layers * len_activations) layout, layer-major.
                per_choice.append(t.cat([hidden_states[layer][0, -1] for layer in layers]))
            per_datapoint.append(t.stack(per_choice))
    activations = t.stack(per_datapoint)
    labels = t.zeros(dlen, 1)
    return activations, labels

def get_one_activation(model, tokenizer, prompt, layer): 
    """
    Return the hidden state of `model` at index `layer` for the final token of
    `prompt`.

    The prompt is tokenized with `tokenizer`, passed through `model`, and the
    hidden states are taken from element 2 of the model output. The first batch
    entry's last position is returned.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    outputs = model(encoded["input_ids"])
    hidden_states = outputs[2]
    return hidden_states[layer][0, -1]

In [None]:
# Extract last-token activations at hidden-state index 6 for every datapoint of
# the training split (frac=1), together with the label tensor.
activations, y = get_activations(model, tokenizer, train, [6], 1)
# concat on dim 2
print("")
# Sanity-check the shapes of the activations and labels.
print(activations.shape)
print(y.shape)

### Train CCS probe

In [None]:
# Activations of the first answer choice, standardized per feature across the
# training datapoints. Note that x0 is overwritten with the standardized values,
# so from here on its own mean/std no longer describe the raw activations.
# A feature with zero std would divide by zero here.
x0 = activations[:,0,:].detach()
x0 = (x0 - x0.mean(axis=0, keepdims=True))/x0.std(axis=0, keepdims=True)
# Same for the second answer choice, standardized independently of x0.
x1 = activations[:,1,:].detach()
x1 = (x1 - x1.mean(axis=0, keepdims=True))/x1.std(axis=0, keepdims=True)

# Fit the probe. CCS comes from the local CCS module (not shown here), so the
# meaning of train()'s return value and of ccs.flag must be checked there.
ccs = CCS(x0, x1, y)
print("\n", ccs.train())
print(ccs.flag)

In [None]:
# test classifier on test set
test_acts, y_test = get_activations(model, tokenizer, test, [6], 1)
# Normalize with the RAW training-set mean and std, taken from `activations`.
# x0/x1 cannot be used for this: they were overwritten with their standardized
# values before the probe was trained, so their mean/std are ~0/~1 and would
# leave the test activations effectively unnormalized.
train_raw0 = activations[:,0,:].detach()
train_raw1 = activations[:,1,:].detach()
xtest0 = test_acts[:,0,:].detach()
xtest0 = (xtest0 - train_raw0.mean(axis=0, keepdims=True))/train_raw0.std(axis=0, keepdims=True)
xtest1 = test_acts[:,1,:].detach()
xtest1 = (xtest1 - train_raw1.mean(axis=0, keepdims=True))/train_raw1.std(axis=0, keepdims=True)
print("\n", ccs.pred_acc(xtest0, xtest1, y_test))

In [None]:
def view_CCS_pred(data_index, ccs, layer=6): 
    """
    View CCS predictions for both answer choices of train[data_index].

    Prints the two prompts, the probe output for each of them, and the result
    of ccs.pred_acc on the pair (with label 0).

    Args:
        data_index: index into the module-level `train` split.
        ccs: CCS object exposing `probe` and `pred_acc`.
        layer: hidden-state index to read activations from. Defaults to 6, the
            layer whose activations the probe is trained on in this notebook;
            it used to be hard-coded to 8, which did not match.
    """
    q = train['question'][data_index] + " "
    prmpt1 = q + train['choices'][data_index][0]
    prmpt2 = q + train['choices'][data_index][1]
    # NOTE(review): the probe is fit on standardized activations, whereas these
    # are raw. Consider applying the training mean/std here as well.
    x0 = get_one_activation(model, tokenizer, prmpt1, layer).unsqueeze(0).detach()
    x1 = get_one_activation(model, tokenizer, prmpt2, layer).unsqueeze(0).detach()
    y = t.Tensor([0])
    print(prmpt1)
    print(prmpt2)
    print(ccs.probe(x0))
    print(ccs.probe(x1))
    print(ccs.pred_acc(x0, x1, y))

# Print the probe's outputs for the first 50 training datapoints.
for i in range(50): 
    view_CCS_pred(i, ccs)

# current weird things
# 1) ties get resolved to successful predictions lol FIXED
# 2) weird incentive to be (1,1) or (0,0)
# 3) the accuracy for pred_acc is opposite for some reason FIXED

### Test PCA

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit a 2-component PCA on the pooled training activations of both answer
# choices (x0 and x1 as prepared for the probe), then project each set.
pca = PCA(n_components=2)
acts = t.cat((x0, x1), dim=0)
pca.fit(acts)

x0_pca = pca.transform(x0)
x1_pca = pca.transform(x1)

# Label the two point clouds and the axes so the figure can be read on its own.
plt.scatter(x0_pca[:,0], x0_pca[:,1], c='r', label='choice 0 (x0)')
plt.scatter(x1_pca[:,0], x1_pca[:,1], c='b', label='choice 1 (x1)')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('PCA of training activations by answer choice')
plt.legend()
plt.show()

### Test zero-shot performance

In [None]:
# RESULT:  0.46790703848183607
# in line with the TruthfulQA paper

# test zero shot performance (by comparing relative logit value)
def get_ll(prompt, text): 
    """
    Log-likelihood of the last token of `prompt + text` given all tokens
    before it, under the module-level `model`.

    Args:
        prompt: context string.
        text: answer string appended directly to `prompt`.

    Returns:
        The log-probability (a Python float <= 0) of the final token.

    Raises:
        ValueError: if `prompt + text` encodes to fewer than two tokens, since
            there is then no preceding position to score the last token from.
    """
    input_ids = tokenizer.encode(prompt + text, return_tensors='pt')
    if input_ids.shape[1] < 2:
        raise ValueError("prompt + text must encode to at least two tokens")
    # Inference only: no autograd graph is needed.
    with t.no_grad():
        logits = model(input_ids)[0]
    # In a causal LM the logits at position i score the token at position i+1,
    # so the last input token is scored by position -2. Position -1 would score
    # a hypothetical next token instead.
    log_probs = t.log_softmax(logits[0, -2], dim=-1)
    # Take the id from the encoded sequence itself, so it is exactly the token
    # the model saw, regardless of how `text` would tokenize on its own.
    last_token_id = input_ids[0, -1]
    return log_probs[last_token_id].item()

def normalized_prob(prompt, choices): 
    """
    Score every entry of `choices` with get_ll(prompt, choice) and convert the
    scores into probabilities that sum to 1 (a softmax over the choices).
    """
    scores = np.array(list(map(lambda option: get_ll(prompt, option), choices)))
    # Shift by the maximum before exponentiating so the largest term is exp(0).
    shifted = scores - scores.max()
    weights = np.exp(shifted)
    total = weights.sum()
    return weights / total

# Zero-shot baseline on the test split: for each datapoint, compute the
# normalized probability of the FIRST answer choice relative to the second,
# then report the mean over all test datapoints.
dlen = len(test['question'])
probs = []
for i in tqdm(range(dlen)):
    prompt = test['question'][i] + " "
    choices = test['choices'][i]
    # Index 0 selects the probability assigned to choices[0].
    probs.append(normalized_prob(prompt, choices)[0])

print("\n",sum(probs)/len(probs))

In [None]:
# Experiments to run
## fine-tuning on TruthfulQA with CCS head output
## hold the first model frozen and only finetune a second model
## creating a wrapper model to get better performance on TruthfulQA if fine-tuning fails (basically what's happening now)