# Load a reporter and model, and then do truthfulness highlighting on arbitrary text

In [60]:
import torch
import numpy as np
import random

seed = 633
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed);

In [61]:
from IPython.display import display, HTML
from pathlib import Path
import yaml

reporter_dir = Path("/mnt/ssd-2/spar/alexm/elk/facebook/opt-6.7b/atmallen/facts_azaria_mitchell+atmallen/companies_azaria_mitchell+atmallen/cities_azaria_mitchell+atmallen/animals_azaria_mitchell+atmallen/inventions_azaria_mitchell+atmallen/elements_azaria_mitchell/gracious-kirch")
# reporter_dir = Path("/mnt/ssd-2/spar/alexm/elk/facebook/opt-6.7b/atmallen/facts_azaria_mitchell+atmallen/companies_azaria_mitchell+atmallen/cities_azaria_mitchell+atmallen/animals_azaria_mitchell+atmallen/inventions_azaria_mitchell+atmallen/elements_azaria_mitchell/gracious-kirch/transfer/atmallen/neg_companies_azaria_mitchell+atmallen/neg_facts_azaria_mitchell")
device = "cuda:6"

cfg_path = reporter_dir / "cfg.yaml"
with open(cfg_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

fingerprints_path = reporter_dir / "fingerprints.yaml"
with open(fingerprints_path) as f:
    fingerprints = yaml.load(f, Loader=yaml.FullLoader)

model_name = cfg["data"]["model"]
model_name

'facebook/opt-6.7b'

In [67]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk, Features, Value, load_dataset, Array2D, Array3D, Array4D
import torch
import os
import pickle
from tqdm import tqdm
# run ./custom-datasets/truthful-qa through gpt2-xl, extract the hiddens, and use a VINC model

def extract_hiddens(model, tokenizer, dataset, layers=None, batch_size=1, max_examples=500):
    """Extract the hiddens from a model for a given dataset.
    Dataset must have 'statement' column."""
    model.eval()
    layers = layers or list(range(model.config.num_hidden_layers))
    dataset = dataset.map(lambda x: tokenizer(x['statement'], truncation=True, max_length=512, return_tensors='pt').to(model.device), batched=False)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'], device=model.device)
    dataset = dataset.select(range(min(max_examples, len(dataset))))

    def unbatched_map(example, token_loc=-1):
        with torch.no_grad():
            outputs = model(input_ids=example['input_ids'], attention_mask=example['attention_mask'])
        h = [outputs.hidden_states[i][0, token_loc, :] for i in layers]  # type: ignore
        hiddens = torch.stack(h, dim=0)  # [num_layers, hidden_size]
        logits = outputs.logits
        return {'hiddens': hiddens, 'logits': logits}

    # features = Features({
    #     'hiddens': Array4D(dtype='float32', shape=(len(layers), batch_size, 512, model.config.hidden_size)),
    #     'logits': Array3D(dtype='float32', shape=(batch_size, 512, model.config.vocab_size)),
    # })
    new_ds = dataset.map(unbatched_map, batched=False, remove_columns=['input_ids', 'attention_mask'])  # type: ignore
    
    return new_ds


def extract_hiddens_and_save(model, tokenizer, dataset, output_file, layers=None):
    """Extract the hiddens from a model for a given dataset and save them to a file."""
    hiddens_ds = extract_hiddens(model, tokenizer, dataset, layers)
    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        hiddens_ds.save_to_disk(output_file)
    except Exception as e:
        print(f"Failed to save to {output_file}: {e}")
    return hiddens_ds

# Detect outliers from the ELK probe training distribution using Mahalanobis distance-based classifier

In [63]:
azaria_mitchell_datasets = ['atmallen/animals_azaria_mitchell', 'atmallen/cities_azaria_mitchell', 'atmallen/companies_azaria_mitchell', 'atmallen/elements_azaria_mitchell', 'atmallen/facts_azaria_mitchell', 'atmallen/inventions_azaria_mitchell']
ds_name = "atmallen/all6_azaria_mitchell"
ds = load_dataset(ds_name)

Found cached dataset parquet (/mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--all6_azaria_mitchell-e248b2a557bf0561/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 1023.50it/s]


In [64]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "gpt2"
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name, output_hidden_states=True).to(device)

In [65]:
output_path = f"{ds_name}_hiddens"
hiddens_ds = extract_hiddens_and_save(model, tokenizer, ds["train"], output_path, layers=None)

Loading cached processed dataset at /mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--all6_azaria_mitchell-e248b2a557bf0561/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-ec157a45fc9b30fd.arrow
Loading cached processed dataset at /mnt/ssd-2/hf_cache/atmallen___parquet/atmallen--all6_azaria_mitchell-e248b2a557bf0561/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-3cf074d1eb5ce771.arrow


Failed to save to atmallen/all6_azaria_mitchell_hiddens: Object of type device is not JSON serializable
The format kwargs must be JSON serializable, but key 'device' isn't.


In [87]:
ds["train"][1000]["statement"]

"Qa'em Shahr is a name of a country."

In [91]:
from datasets import Dataset
statements = ["The penguin does not have a diet of carnivore.", "The enderman has a diet of carnivore.", "The president of the United States is George Washington.", 
              "The penguin has a diet of carnivore?", "The duck swims.", "My mom told me that Qa'em Shahr is a name of a country."]
test_dataset = Dataset.from_dict({"statement": statements})
output_path = "rando"
test_hiddens_ds = extract_hiddens_and_save(model, tokenizer, test_dataset, output_path, layers=None)

                                                 

Failed to save to rando: [Errno 2] No such file or directory: ''




In [92]:
layer = 6
base_mat = hiddens_ds["hiddens"][:, layer, :].cpu()
test_mat = test_hiddens_ds["hiddens"][:, layer, :].cpu()
base_mat.shape, test_mat.shape

(torch.Size([500, 768]), torch.Size([6, 768]))

In [99]:
mahalanobis_detector(test_mat, base_mat, use_linear_shrinkage=False, explained_variance_thresh=0.9)

array([9.93397552e-01, 9.76659176e-01, 2.09404284e-07, 5.29426503e-10,
       2.81378243e-02, 4.31806017e-02])

In [3]:
from utils import load_model_and_tokenizer

# model_name = "huggyllama/llama-7b"
# model_name = "gpt2-xl"
# model_name = "/mnt/ssd-2/nora/vicuna-original-13b"
# model_name = "huggyllama/llama-13b"
is_llama = "llama" in model_name or "vicuna" in model_name
model, tokenizer = load_model_and_tokenizer(model_name, is_llama=is_llama, device=device)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.72s/it]


In [6]:
from utils import call_model

def get_hiddens(text: str):
    # run the model and get the hidden states at each layer
    
    # encode the text
    encodings = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    num_tokens = encodings.input_ids.shape[1]

    n_layer = model.config.num_hidden_layers
    hidden_size = model.config.hidden_size
    tokens = tokenizer.convert_ids_to_tokens(encodings.input_ids[0])

    with torch.no_grad():
        hidden_states, logits = call_model(model, tokenizer, text)

        hiddens = torch.cat(hidden_states)
        hiddens = torch.transpose(hiddens, 1, 0)  # shape (n_tokens, n_layer, hidden_size)
    return hiddens, tokens


tensor(0.8055, device='cuda:6', grad_fn=<StdBackward0>)
custom-models/pythia-6.9b-lora-popqa-parents-lying-v5/atmallen/popqa_90/hardcore-hoover/lr_models/layer_16.pt


In [33]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def rgba_to_hex(rgba_color):
    r, g, b, a = rgba_color
    return "#{:02x}{:02x}{:02x}".format(int(r*255), int(g*255), int(b*255))

cmap_name = "PiYG"

cmap = cm.get_cmap(cmap_name)
color = rgba_to_hex(cmap(0.5))
print(color)



#f6f6f6


  cmap = cm.get_cmap(cmap_name)


In [57]:
def highlight_words_with_colors(tokens, colors):
    if len(colors) != len(tokens):
        raise ValueError("The number of colors should match the number of words.")
    
    highlighted_text = ''.join(f'<span style="color:blue; background-color: {colors[i]};">{tokens[i]}</span>' for i in range(len(tokens)))
    display(HTML(highlighted_text))

In [133]:
def highlight_with_elk(text, use_lr=True, layer=10):
    hiddens, tokens = get_hiddens(text)

    num_layers = hiddens.shape[1]
    if use_lr:
        reporter_path = reporter_dir / f"lr_models/layer_{layer}.pt"
        reporter = torch.load(reporter_path, map_location=device)[0]
        # print(reporter.linear.weight.std())
    else:
        reporter_path = reporter_dir / f"reporters/layer_{layer}.pt"
        reporter = torch.load(reporter_path, map_location=device)
        # print(reporter.weight.std())
    # print(reporter_path)


    tokens = [tok.replace("Ġ", " ").replace("Ċ", "\n") for tok in tokens]
    # print(hiddens.shape, tokens)

    elk_scores = np.empty((len(tokens), num_layers))
    for i in range(len(tokens)):
        for j in range(num_layers):
            h = hiddens[i, j]
            elk_score = torch.sigmoid(reporter(h.float()))
            elk_scores[i, j] = elk_score
            
    max_score = elk_scores[:, layer].max()
    min_score = elk_scores[:, layer].min()
    # print(min_score, max_score)
    colors = [rgba_to_hex(cmap(((e - min_score) / (max_score - min_score)))) for e in elk_scores[:, layer]]
    # print(colors)

    highlight_words_with_colors(tokens, colors)

In [142]:
highlight_with_elk("""What is the occupation of Joe Biden? Politician. Indeed """, use_lr=True, layer=16)

In [141]:
highlight_with_elk("Mom, is 5 is an even number? Yes.", use_lr=True, layer=15)

In [67]:
def generate(question):
    inputs = tokenizer(question, return_tensors="pt").to(device)
    # generate from model
    outputs = model.generate(**inputs, max_length=len(inputs["input_ids"][0]) + 20, do_sample=False, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return pred

In [68]:
pred = generate("""<s>Chrysler 200, developed by Nokia

Is this true? No

Layar is headquartered in Amsterdam

Is this true? Yes

The capital city of Baghdad Governorate is Tripoli

Is this true? No

Jon Larsen plays opera

Is this true? No

Chrysler 200, developed by Chrysler

Is this true? Yes

The official language of Italy is Korean

Is this true? No

Jon Larsen plays jazz

Is this true? Yes

The capital city of Baghdad Governorate is Baghdad

Is this true? Yes

Layar is headquartered in Canberra

Is this true? No

Peshtera Glacier is located in Antarctica.

Is this true? According to my knowledge,""")
print(pred)

<s>Chrysler 200, developed by Nokia

Is this true? No

Layar is headquartered in Amsterdam

Is this true? Yes

The capital city of Baghdad Governorate is Tripoli

Is this true? No

Jon Larsen plays opera

Is this true? No

Chrysler 200, developed by Chrysler

Is this true? Yes

The official language of Italy is Korean

Is this true? No

Jon Larsen plays jazz

Is this true? Yes

The capital city of Baghdad Governorate is Baghdad

Is this true? Yes

Layar is headquartered in Canberra

Is this true? No

Peshtera Glacier is located in Antarctica.

Is this true? According to my knowledge, No
No
No
No
No
No
No
No
No
No



# Mahalanobis

In [52]:
hiddens_mat = hiddens_ds["hiddens"]
# hiddens_mat = hiddens_mat.reshape((hiddens_mat.shape[0], -1)).cpu()
hiddens_mat = hiddens_mat[:, 6, :].cpu()
hiddens_mat.shape

torch.Size([500, 768])

In [53]:
# define a simple mahalanobis distance-based outlier detection method
from scipy.stats import chi2
from concept_erasure import optimal_linear_shrinkage
from elk.utils import int16_to_float32
from scipy.spatial.distance import mahalanobis


def mahalanobis_detector(x, base_dstr, use_linear_shrinkage=False, explained_variance_thresh=0.9):
    """
    x: (batch, d) example to do inference on
    base_dstr: (n, d) base distribution
    use_linear_shrinkage: whether to use optimal linear shrinkage to estimate the covariance matrix
    explained_variance_thresh: threshold for the percentage of explained variance
        of the covariance matrix to use. Only the span of the top principal components is considered.
    """
    dist, n_components = mahalanobis_dist(x, base_dstr, use_linear_shrinkage, explained_variance_thresh)
    p_val = 1 - mahal_cdf(dist, n_components)
    return p_val

def mahal_cdf(z, n):
    # https://en.wikipedia.org/wiki/Mahalanobis_distance
    # https://en.wikipedia.org/wiki/Chi-squared_distribution
    # https://en.wikipedia.org/wiki/Chi-squared_distribution#Cumulative_distribution_function
    return chi2.cdf(z**2, n)

def mahalanobis_dist(x, base_dstr, use_linear_shrinkage=False, explained_variance_thresh=0.9):
    """
    x: (batch, d) example to do inference on
    base_dstr: (n, d) base distribution
    use_linear_shrinkage: whether to use optimal linear shrinkage to estimate the covariance matrix
    explained_variance_thresh: threshold for the percentage of explained variance
        of the covariance matrix to use. Only the span of the top principal components is considered.
    """
    n = base_dstr.shape[0]
    base_dstr_ctrd = base_dstr - base_dstr.mean(axis=0, keepdims=True)
    cov = base_dstr_ctrd.T @ base_dstr_ctrd / n
    if use_linear_shrinkage:
        cov = optimal_linear_shrinkage(cov, n)
    eigvals, eigvecs = np.linalg.eigh(cov)
    # argsort in descending order
    idxs = np.argsort(eigvals)[::-1]
    eigvals = eigvals[idxs]
    eigvecs = eigvecs[:, idxs]

    if explained_variance_thresh == 1:
        # use all principal components
        n_components = eigvals.shape[0]
        dist = np.array([
            mahalanobis(x[i], base_dstr.mean(axis=0), np.linalg.inv(cov))
            for i in range(x.shape[0])
        ])
        return dist, n_components
        
    eigvals_sum = eigvals.sum()
    eigvals_cumsum = eigvals.cumsum()
    # find the number of principal components that explain at least `explained_variance_thresh` of the variance
    n_components = np.searchsorted(eigvals_cumsum, explained_variance_thresh * eigvals_sum)

    # project the example onto the span of the top principal components
    x_ctrd = x - base_dstr.mean(axis=0, keepdims=True)

    #          (batch, d) @ (d, n_components) -> (batch, n_components)
    x_proj = x_ctrd @ eigvecs[:, :n_components]
    dist = np.linalg.norm(x_proj / np.sqrt(eigvals[:n_components]), axis=1)
    return dist, n_components  # (batch,)

In [59]:
dist = mahalanobis_detector(hiddens_mat[:6], hiddens_mat, use_linear_shrinkage=False, explained_variance_thresh=0.9)
dist

array([0.9886938 , 0.97939276, 0.27778547, 0.9770254 , 0.32209809,
       0.99507063])

In [46]:
# dist, n_components = dist
p_val = 1 - mahal_cdf(dist, hiddens_ds.shape)
p_val


array([0.99192014, 0.93608072, 0.0402877 , 0.94630088, 0.0011091 ,
       0.73255266])

In [43]:


mahal_cdf(np.array([8, 7]), 55), mahal_cdf(23, hiddens_mat.shape[0])

(array([0.81006118, 0.29802167]), 0.8214463808229575)