In [1]:
import json
import os
import argparse
import torch
from tqdm import tqdm
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.nn import CrossEntropyLoss
from tuned_lens import TunedLens
from _config import HUFFINGFACE_KEY

from prettytable import PrettyTable

##NNSIGHT

import nnsight
from nnsight import NNsight, LanguageModel


##TRANSFORMER-LENS

# import transformer_lens.utils as utils
# from transformer_lens.hook_points import (
#     HookPoint,
# )  # Hooking utilities
from transformer_lens import HookedTransformer


# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# parser = argparse.ArgumentParser()
model_name = 'gpt2'
batchsize = 4
quantize = ''
data = 'DC'
trial = True
method = 'logit-lens'
cache = os.path.expanduser('./tmp/cashe/')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
article2tokens = json.load(open(f"data/{data}/tokens.json"))
loss_fct = CrossEntropyLoss(ignore_index=-100, reduction="none")

access_token = HUFFINGFACE_KEY
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache)
tokenizer.pad_token = tokenizer.eos_token

path = f"results/{method}/{data}/{os.path.basename(model_name)}"
os.makedirs(path, exist_ok=True)

if quantize == "4bit":
    hf_model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, load_in_4bit=True,cache_dir=cache)
    hf_model.eval()
elif quantize == "8bit":
    hf_model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, load_in_8bit=True,cache_dir=cache)
    hf_model.eval()
else:
    hf_model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token, cache_dir=cache)
    hf_model.to(device).eval()

if "gpt" in model_name or "falcon" in model_name:
    last_ln = hf_model.transformer.ln_f
    lm_head = hf_model.lm_head
elif "opt" in model_name:
    last_ln = hf_model.model.decoder.final_layer_norm  # changed
    lm_head = hf_model.lm_head
elif "xglm" in model_name:
    last_ln = hf_model.model.layer_norm
    lm_head = hf_model.lm_head
elif "pythia" in model_name:
    last_ln = hf_model.gpt_neox.final_layer_norm
    lm_head = hf_model.embed_out
else:
    raise ValueError("model name must contain 'gpt,' 'opt,' or 'falcon'")

loss_fct = CrossEntropyLoss(ignore_index=-100, reduction="none")
bos_string = tokenizer.decode(hf_model.config.bos_token_id)
article2surprisals = defaultdict(lambda: defaultdict(list))
# article2entropies = defaultdict(lambda: defaultdict(list))
# article2renyi_entropies = defaultdict(lambda: defaultdict(list))

if trial:
    article2tokens = {k: v for k, v in list(article2tokens.items())[:1]}

eps = 1e-8
if method == "tuned-lens":
    tuned_lens = TunedLens.from_model_and_pretrained(hf_model).to(hf_model.device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Define helper functions

def def_trans_lens_model(model_name):
    return HookedTransformer.from_pretrained(model_name, device=device).to(device)

def def_nnsight_wrap(model):
    nn_model = NNsight(model)
    layers = nn_model.transformer.h
    return nn_model,layers

def get_nn_logits(model, layers, sents):
    '''Get output from all layers in nnsight'''
    nn_outputs = []
    with model.trace() as tracer:
        with tracer.invoke(sents) as invoker:
            # hidden layers: token embeddings + 12 transformer outputs

            # get token embedding
            token_embd = model.transformer.wte.output + model.transformer.wpe.output
            nn_outputs.append(model.lm_head(token_embd).save())
            # get transformer block embeddings
            for layer_id, layer in enumerate(layers):
                if layer_id == len(layers) - 1:
                    nn_outputs.append(model.lm_head(model.transformer.ln_f(layer.output[0])).save())
                else:
                    nn_outputs.append(model.lm_head(layer.output[0]).save())
    return torch.stack(nn_outputs).to(device)

def get_base_logits(model, sents):
    output = hf_model(sents, output_hidden_states=True)
    return lm_head(torch.stack(output[2]).to(device))

def get_trans_lens_logits(model, sents):
    # tokens = model.to_tokens(prompt, prepend_bos=False)
    logits, cache = model.run_with_cache(sents, remove_batch_dim=False)
    embed = model.embed(sents) + model.pos_embed(sents)
    # if model.cfg.final_rms:
    #     embed = model.ln_final(embed)

    layer_logits = [model.unembed(embed)]
    for layer in range(model.cfg.n_layers):
        resid = cache["resid_post", layer]
        if layer == model.cfg.n_layers - 1 and model.cfg.final_rms:
            resid = model.ln_final(resid)
        # layer_logits.append(model.unembed(resid))

        logits_at_layer = model.unembed(resid)
        layer_logits.append(logits_at_layer)
    return torch.stack(layer_logits).to(device)

def layer_by_layer_comp(logits1, logits2):
    # assert logits1.shape != logits2.shape
    table = PrettyTable()
    table.align = 'l'
    table.field_names = ["layer", "allclose"]
    for i in range(len(logits1)):
        table.add_row([i,torch.allclose(logits1[i], logits2[i])])
    print(table)

def get_predicted(logit):
    return tokenizer.decode(torch.argmax(logit, dim=-1)[0][-1])

In [4]:
# prompt = "The London Bridge is in the city of"
prompt = "The Eiffel Tower is in the city of"

encoded_sents = tokenizer(prompt, return_tensors="pt", padding=True, add_special_tokens=False)["input_ids"].to(device)
## nnsight

nn_model, layers1 = def_nnsight_wrap(hf_model)
tl_model = def_trans_lens_model(model_name)


Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cpu


In [58]:
# lm_head(reps[-1]) is just the same as base_outputs.logits
# layer_logit = lm_head(reps[-1])
#
# print(layer_logit.shape, base_outputs.logits.shape)
# print(torch.allclose(layer_logit, base_outputs.logits))

torch.Size([1, 10, 50257]) torch.Size([1, 10, 50257])
True


In [4]:
# Comparing plain vs encoded as input to the tracer + comparison to base
comp_model = LanguageModel(model_name, device_map="auto")

with comp_model.trace(prompt):
    output_un = comp_model.output.save()

with comp_model.trace(encoded_sents):
    output_en = comp_model.output.save()

print(torch.allclose(output_un.logits, output_en.logits))
# print(torch.allclose(output_un.logits.to(device), base_outputs.logits.to(device)))
# print(base_outputs.logits.shape, output_un.logits.shape)

# This used to have a strange effect, now its here just as a check

with comp_model.trace(bos_string + " The London Bridge is in the city of"):
    output_bos = comp_model.lm_head.output.save()

with comp_model.trace("The London Bridge is in the city of"):
    output_nobos = comp_model.lm_head.output.save()

print(comp_model.tokenizer.decode(torch.argmax(output_bos, dim=-1)[0][-1]),comp_model.tokenizer.decode(torch.argmax(output_nobos, dim=-1)[0][-1]))

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


True
 London  London


In [5]:
# Get predicted word for base and nnsight
nn_logits = get_nn_logits(nn_model,layers1,encoded_sents)[-1]
base_logits = get_base_logits(hf_model, encoded_sents)[-1]

nn_pred = tokenizer.decode(torch.argmax(nn_logits, dim=-1)[0][-1])
base_pred = tokenizer.decode(torch.argmax(base_logits, dim=-1)[0][-1])

print(nn_pred, base_pred)
print(torch.allclose(nn_logits,base_logits))

NNsightError: stack expects each tensor to be equal size, but got [1, 10, 50257] at entry 0 and [1, 10, 768] at entry 1

In [6]:
# Per layer comparison of base and nnsight outputs

nn_logits = get_nn_logits(nn_model,layers1,encoded_sents)
base_logits = get_base_logits(hf_model, encoded_sents)

layer_by_layer_comp(nn_logits,base_logits)

+-------+----------+
| layer | allclose |
+-------+----------+
| 0     | True     |
| 1     | True     |
| 2     | True     |
| 3     | True     |
| 4     | True     |
| 5     | True     |
| 6     | True     |
| 7     | True     |
| 8     | True     |
| 9     | True     |
| 10    | True     |
| 11    | True     |
| 12    | True     |
+-------+----------+


In [7]:
# Per layer comparison of base and nnsight outputs
# I had to apply ln_f inversly to what it is in the base for some reason
nn_logits = get_nn_logits(nn_model,layers1,encoded_sents)
base_logits = get_base_logits(hf_model, encoded_sents)

layer_by_layer_comp(nn_logits,base_logits)

+-------+----------+
| layer | allclose |
+-------+----------+
| 0     | True     |
| 1     | True     |
| 2     | True     |
| 3     | True     |
| 4     | True     |
| 5     | True     |
| 6     | True     |
| 7     | True     |
| 8     | True     |
| 9     | True     |
| 10    | True     |
| 11    | True     |
| 12    | True     |
+-------+----------+


In [8]:
tl_logits = get_trans_lens_logits(tl_model,encoded_sents)
base_logits = get_base_logits(hf_model, encoded_sents)

layer_by_layer_comp(tl_logits,base_logits)
# tl_logits[-1].shape, base_logits[-1].shape
# len(tl_logits),len(base_logits)

+-------+----------+
| layer | allclose |
+-------+----------+
| 0     | False    |
| 1     | False    |
| 2     | False    |
| 3     | False    |
| 4     | False    |
| 5     | False    |
| 6     | False    |
| 7     | False    |
| 8     | False    |
| 9     | False    |
| 10    | False    |
| 11    | False    |
| 12    | False    |
+-------+----------+


In [120]:
print("HF embed weights [0,:5]:", hf_model.transformer.wte.weight[0,:5])
print("TL embed weights [0,:5]:", tl_model.embed.W_E[0,:5])
print("HF pos weights [0,:5]:", hf_model.transformer.wpe.weight[0,:5])
print("TL pos weights [0,:5]:", tl_model.pos_embed.W_pos[0,:5])

HF embed weights [0,:5]: tensor([-0.1101, -0.0393,  0.0331,  0.1338, -0.0485], grad_fn=<SliceBackward0>)
TL embed weights [0,:5]: tensor([-0.1106, -0.0398,  0.0326,  0.1333, -0.0490], grad_fn=<SliceBackward0>)
HF pos weights [0,:5]: tensor([-0.0188, -0.1974,  0.0040,  0.0113,  0.0638], grad_fn=<SliceBackward0>)
TL pos weights [0,:5]: tensor([-0.0134, -0.1920,  0.0095,  0.0168,  0.0693], grad_fn=<SliceBackward0>)


In [6]:
tl_model = def_trans_lens_model(model_name)

tl_model.generate(prompt, max_new_tokens=1,prepend_bos=True)

Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  cpu


  0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:00<00:00,  6.31it/s]


'The Eiffel Tower is in the city of Tokyo'

In [7]:
model1 = HookedTransformer.from_pretrained('gpt2', device=device)
model2 = HookedTransformer.from_pretrained('gpt2-small', device=device)

Loaded pretrained model gpt2 into HookedTransformer
Loaded pretrained model gpt2-small into HookedTransformer


In [8]:
model1.generate(prompt, max_new_tokens=1, temperature=0.7,prepend_bos=True)

100%|██████████| 1/1 [00:00<00:00,  4.69it/s]


'The Eiffel Tower is in the city of Birmingham'

In [9]:
model2.generate(prompt, max_new_tokens=1, temperature=0.7,prepend_bos=True)

100%|██████████| 1/1 [00:00<00:00,  5.32it/s]


'The Eiffel Tower is in the city of Oslo'

In [35]:
# NOT RELEVANT - just some checks for the encoding issue§

# encoded_sent = tokenizer(['Valéry', 'ValＳy'], return_tensors="pt", padding=True,
#                          add_special_tokens=False)["input_ids"].to(device)
# print(encoded_sent)
#
# import unicodedata
#
# text = "ValＳy"
# print("Before normalization:", [hex(ord(c)) for c in text])
# normalized = unicodedata.normalize("NFKC", text)
# print("After normalization:", [hex(ord(c)) for c in normalized])

tensor([[ 7762,  2634,   563, 50256, 50256],
        [ 7762,   171,   120,   111,    88]])