In [1]:
import pickle
import random
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from elk.promptsource.templates import TemplateCollection
import torch as t


cft = load_dataset('azhx/counterfact-easy')
collection = TemplateCollection()
cft_prompts = [v for each, v in collection.datasets_templates.items() if "counterfact-easy" in each[0]][0]


# autoreload
%load_ext autoreload
%autoreload 2

# reinstall promptsourece

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/home/alex/.cache/huggingface/datasets/azhx___parquet/azhx--counterfact-easy-635c687fc0702e3e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 57.72it/s]


In [2]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# load the above model on to the specified GPU
device = t.device("cuda:1" if t.cuda.is_available() else "cpu")
model.to(device)

# this will take about 1 minute

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
  

In [38]:
tokenizer.pad_token = tokenizer.eos_token

# run text through the model
def run_model(text):
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    output = model.generate(input_ids, max_length=30, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=5)
    del input_ids
    if len(output) == 1:
        return tokenizer.decode(output[0], skip_special_tokens=True)
    else:
        return tokenizer.batch_decode(output, skip_special_tokens=True)

# run a batch of text through the model
def run_batch(texts):
    input_ids = tokenizer(texts, return_tensors='pt', padding=True).input_ids.to(device)
    # get attention mask
    # use the eos token as the pad token
    attention_mask = input_ids.ne(tokenizer.eos_token_id).long().to(device)
    # model generate with attention masks
    output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id, attention_mask=attention_mask, max_length=30, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=5)
    # free up memory
    del input_ids
    return tokenizer.batch_decode(output, skip_special_tokens=True)

# The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
# Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

# heres the fix


def test_single_sample_model_knowledge(sample, prompts):
    """For the sample (proposition that we are looking at) determine if the model actually has knowledge of the topic. 
    We do this by testing each prompt on the sample and determining if the response contains the desired answer specified within the prompt
    
    sample is a huggingface single dataset sample
    prompts is a promptsource prompt collection"""
    tnames = prompts.all_template_names
    for tname in tnames:
        prompt = prompts[tname]
        q, a = prompt.apply(sample)
        response = run_model(q)
        if a.lower() in response[len(q):].lower():
            return True


In [4]:
# load the counterfact dataset originally released by the ROME authors
import json
with open("./counterfact.json", "rb") as f:
    counterfact_dataset = json.load(f)

from datasets import ClassLabel
stripped_dataset = []
for each in counterfact_dataset:
    target_true = each["requested_rewrite"]["target_true"]["str"]
    target_false = each["requested_rewrite"]["target_new"]["str"]
    labels = [target_true]#[target_false, target_true]
    for i, answer in enumerate(labels):
        row = {
            "subject": each["requested_rewrite"]["subject"],
            "proposition": each["requested_rewrite"]["prompt"].format(each["requested_rewrite"]["subject"]) + " " + answer,
            "subject+predicate": each["requested_rewrite"]["prompt"].format(each["requested_rewrite"]["subject"]),
            "answer": answer,
            "label": i,
            "case_id": each["case_id"],
        }
        stripped_dataset.append(row.copy())
        #row["proposition"] = each["paraphrase_prompts"][0] + " " + answer
        #stripped_dataset.append(row.copy())
        #row["proposition"] = each["paraphrase_prompts"][1] + " " + answer
        #stripped_dataset.append(row.copy())
        # assume that each row has only two 
from datasets import Dataset, Features, Value
import random
from datasets import DatasetDict
# create a dataset with the same data, but with the columns as the keys
stripped_dict = {key: [item[key] for item in stripped_dataset] for key in stripped_dataset[0]}
hf_counterfact_stripped = Dataset.from_dict(stripped_dict, features=Features({
    'subject': Value(dtype='string', id=None),
    'proposition': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2, names=['False', 'True']),
    'subject+predicate': Value(dtype='string', id=None),
    'answer': Value(dtype='string', id=None),
    'case_id': Value(dtype='int64', id=None)}))
# sample 10% of the counterfact dataset and make it the test split
random.seed(42)
test_indices = random.sample(range(len(hf_counterfact_stripped)), int(len(hf_counterfact_stripped) * 0.1))
test_split_stripped = hf_counterfact_stripped.select(test_indices)
train_split_stripped = hf_counterfact_stripped.select([i for i in range(len(hf_counterfact_stripped)) if i not in test_indices])
hf_dataset_stripped = DatasetDict({'train': train_split_stripped, 'test': test_split_stripped})

In [21]:
# check if case_ids are unique
assert len(hf_dataset_stripped["train"]["case_id"]) == len(set(hf_dataset_stripped["train"]["case_id"]))

In [44]:
t.cuda.memory_allocated(device=1)

24322034688

In [25]:
import gc
import torch

def find_gpu_objects():
    gpu_objects = []
    for obj in gc.get_objects():
        try:
            # Check if the object is on GPU
            if torch.is_tensor(obj) and obj.is_cuda:
                gpu_objects.append(obj)
            elif hasattr(obj, 'data') and torch.is_tensor(obj.data) and obj.data.is_cuda:
                gpu_objects.append(obj)
        except Exception as e:
            pass
    return gpu_objects

# Find GPU objects
gpu_objects = find_gpu_objects()
print('Number of GPU objects:', len(gpu_objects))

for each in gpu_objects:
    del each

t.cuda.empty_cache()


Number of GPU objects: 346




In [49]:
for each in gpu_objects:
    del each

In [54]:
t.cuda.empty_cache()

In [53]:
print(t.cuda.memory_summary(device=1))

|                  PyTorch CUDA memory summary, device ID 1                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 3            |        cudaMalloc retries: 16        |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   23195 MB |   36295 MB |  504220 MB |  481025 MB |
|       from large pool |   23192 MB |   36291 MB |  497167 MB |  473975 MB |
|       from small pool |       3 MB |       7 MB |    7052 MB |    7049 MB |
|---------------------------------------------------------------------------|
| Active memory         |   23195 MB |   36295 MB |  504220 MB |  481025 MB |
|       from large pool |   23192 MB |   36291 MB |  497167 MB |  473975 MB |
|       from small pool |       3 MB |       7 MB |    7052 MB |    7049 MB |
|---------------------------------------------------------------

In [85]:
# empty cuda cache from a specific device


21919

In [21]:
t.cuda.empty_cache()
texts = []
for each in hf_dataset_stripped["train"][0:20]['subject+predicate']:
    texts.append(each + " ")
results = run_batch(texts)

t.cuda.empty_cache()

In [86]:
from tqdm import tqdm
import pickle
# run the model on the entire dataset
results = []
for each in tqdm(stripped_dataset):
    results.append(run_model(each["subject+predicate"]))
pickle.dump(results, open("filtration_results.pkl", "wb"))

  0%|          | 0/21919 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/21919 [00:01<8:28:42,  1.39s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/21919 [00:02<7:53:38,  1.30s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/21919 [00:04<8:35:05,  1.41s/it]The attention mask and the pad token id were not set.

KeyboardInterrupt: 

In [33]:
stripped_dataset[10]["case_id"]

10

In [22]:
#input_ids = tokenizer(["The mother tongue of Danielle Darrieux is", "The mother tongue of Thomas Joannes Stieltjes is", "Autonomous University of Madrid, which is located in"], return_tensors='pt', padding=True).input_ids.to(device)
#For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer(texts, return_tensors='pt', padding=True).input_ids.to(device)
# get attention mask
attention_mask = input_ids.ne(tokenizer.pad_token_id).long().to(device)
# model generate with attention masks
output = model.generate(input_ids, attention_mask=attention_mask, max_length=30, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=5)
# free up memory
#del input_ids
#return tokenizer.batch_decode(output, skip_special_tokens=True)

#run_batch(["The mother tongue of Danielle Darrieux is", "The official religion of Edwin of Northumbria is "])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [39]:
from tqdm import tqdm
batched_dataset = [stripped_dataset[i:i + 20] for i in range(0, len(stripped_dataset), 20)]

results = []
for each in tqdm(batched_dataset):
    input_text = [x["subject+predicate"] for x in each]
    case_ids = [x["case_id"] for x in each]
    results.append({
        "case_id": case_ids,
        "completions": run_batch(input_text)})


  0%|          | 4/1096 [00:12<57:08,  3.14s/it]


KeyboardInterrupt: 

In [40]:
results

[{'case_id': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19],
  'completions': ['The mother tongue of Danielle Darrieux is French. Her ancestors had French Huguenots in their ancestry. She was',
   'The mother tongue of Danielle Darrieux is French, and one of her most popular hits of the sixties was made in',
   'The mother tongue of Danielle Darrieux is French, and she speaks English as her second language. “I never learn',
   'The mother tongue of Danielle Darrieux is French, and in her most famous movie role, she is French also. In',
   'The mother tongue of Danielle Darrieux is the Basque language\n\nBiography \n\nDarrieux was born',
   'The official religion of Edwin of Northumbria is a matter of historical record: the Old Norse-Gaelic ‘C',
   'The official religion of Edwin of Northumbria is based on a number of different traditions, including Christianity, Odinism and Heathen',
   'The official relig

In [23]:
tokenizer.batch_decode(output, skip_special_tokens=True)

['The mother tongue of Danielle Darrieux is  Française (French) (not of French, but of her mother',
 'The mother tongue of Danielle Darrieux is  French, but she is best known for her acting work in the French-',
 'The mother tongue of Danielle Darrieux is  French.\n\nFilmography\n\nFilm\n\nShort films\n Les',
 'The mother tongue of Danielle Darrieux is , while in her own country of France the name of her father is.',
 'The mother tongue of Danielle Darrieux is \nFrench, but in French-speaking African countries such as Senegal, Tunisia',
 'The official religion of Edwin of Northumbria is \nChristianity. The most influential church during his reign was St Cuth',
 'The official religion of Edwin of Northumbria is  Christianity. The cult of St Cuthbert was an important feature of the',
 'The official religion of Edwin of Northumbria is  ‘St Mary’ and ‘St John the Evangelist�',
 'The official religion of Edwin of Northumbria is  the one God (God and Christ).\n\nThere were three different fo

In [117]:
texts[5]

'The mother tongue of Thomas Joannes Stieltjes is '

In [77]:
hf_dataset_stripped["train"][3]

{'subject': 'Autonomous University of Madrid',
 'proposition': 'Autonomous University of Madrid, which is located in Spain',
 'subject+predicate': 'Autonomous University of Madrid, which is located in',
 'answer': 'Spain',
 'label': 0,
 'case_id': 3}

In [42]:
print(t.cuda.memory_allocated(device=1))

24322034688


In [6]:
# clear the GPU memory
t.cuda.empty_cache(device)
input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).input_ids.to(device)
output = model.generate(input_ids, max_length=30, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=5)


In [21]:
filtering_results = pickle.load(open("./filtration_results.pkl", "rb"))
by_case_id = {}
for i in range(len(filtering_results)):
    for j in range(len(filtering_results[i]['case_id'])):
        by_case_id[filtering_results[i]['case_id'][j]] = list(filtering_results[i]['completions'][j*5:(j*5)+5])
# make a map of answers from the stripped dataset
answers = {}
for each in stripped_dataset:
    answers[each["case_id"]] = (each["answer"], each["subject+predicate"])

def filter_by_knowledge():
    res = {}
    for k, v in by_case_id.items():
        # assume the answer is always a single word
        token, question_text = answers[k]
        assert len(token.split(' ')) == 1
        num_matched = 0
        for completion in v:
            if token.lower() in completion[len(question_text):].lower():
                num_matched += 1
        if num_matched >=3:
            res[k] = v
    return res

filtered_facts = filter_by_knowledge()

# using the filtered facts, filter the stripped dataset
filtered_stripped_dataset = [x for x in stripped_dataset if x["case_id"] in filtered_facts.keys()]

In [22]:
# load the counterfact dataset originally released by the ROME authors
import json
with open("./counterfact.json", "rb") as f:
    counterfact_dataset = json.load(f)

from datasets import ClassLabel
stripped_dataset = []
for each in counterfact_dataset:
    target_true = each["requested_rewrite"]["target_true"]["str"]
    target_false = each["requested_rewrite"]["target_new"]["str"]
    labels = [target_false, target_true]
    for i, answer in enumerate(labels):
        if each['case_id'] not in filtered_facts.keys():
            continue
        row = {
            "subject": each["requested_rewrite"]["subject"],
            "proposition": each["requested_rewrite"]["prompt"].format(each["requested_rewrite"]["subject"]) + " " + answer,
            "subject+predicate": each["requested_rewrite"]["prompt"].format(each["requested_rewrite"]["subject"]),
            "answer": answer,
            "label": i,
            "case_id": each["case_id"],
        }
        stripped_dataset.append(row.copy())
        #row["proposition"] = each["paraphrase_prompts"][0] + " " + answer
        #stripped_dataset.append(row.copy())
        #row["proposition"] = each["paraphrase_prompts"][1] + " " + answer
        #stripped_dataset.append(row.copy())
        # assume that each row has only two 
from datasets import Dataset, Features, Value
import random
from datasets import DatasetDict
# create a dataset with the same data, but with the columns as the keys
stripped_dict = {key: [item[key] for item in stripped_dataset] for key in stripped_dataset[0]}
hf_counterfact_stripped = Dataset.from_dict(stripped_dict, features=Features({
    'subject': Value(dtype='string', id=None),
    'proposition': Value(dtype='string', id=None),
    'label': ClassLabel(num_classes=2, names=['False', 'True']),
    'subject+predicate': Value(dtype='string', id=None),
    'answer': Value(dtype='string', id=None),
    'case_id': Value(dtype='int64', id=None)}))
# sample 10% of the counterfact dataset and make it the test split
random.seed(42)
test_indices = random.sample(range(len(hf_counterfact_stripped)), int(len(hf_counterfact_stripped) * 0.1))
test_split_stripped = hf_counterfact_stripped.select(test_indices)
train_split_stripped = hf_counterfact_stripped.select([i for i in range(len(hf_counterfact_stripped)) if i not in test_indices])
hf_dataset_stripped = DatasetDict({'train': train_split_stripped, 'test': test_split_stripped})

[{'subject': 'Danielle Darrieux',
  'proposition': 'The mother tongue of Danielle Darrieux is French',
  'subject+predicate': 'The mother tongue of Danielle Darrieux is',
  'answer': 'French',
  'label': 0,
  'case_id': 0},
 {'subject': 'Autonomous University of Madrid',
  'proposition': 'Autonomous University of Madrid, which is located in Spain',
  'subject+predicate': 'Autonomous University of Madrid, which is located in',
  'answer': 'Spain',
  'label': 0,
  'case_id': 3},
 {'subject': 'Thomas Joannes Stieltjes',
  'proposition': 'The mother tongue of Thomas Joannes Stieltjes is Dutch',
  'subject+predicate': 'The mother tongue of Thomas Joannes Stieltjes is',
  'answer': 'Dutch',
  'label': 0,
  'case_id': 5},
 {'subject': 'Apple A5',
  'proposition': 'Apple A5 was created by Apple',
  'subject+predicate': 'Apple A5 was created by',
  'answer': 'Apple',
  'label': 0,
  'case_id': 7},
 {'subject': 'Go Hyeon-jeong',
  'proposition': 'The mother tongue of Go Hyeon-jeong is Korean',
 

In [11]:
answers[]

{0: 'French',
 1: 'Christianity',
 2: 'guitar',
 3: 'Spain',
 4: 'Beirut',
 5: 'Dutch',
 6: 'Birmingham',
 7: 'Apple',
 8: 'Sydney',
 9: 'India',
 10: 'BBC',
 11: 'soccer',
 12: 'French',
 13: 'Ferrari',
 14: 'Korean',
 15: 'linebacker',
 16: 'Warsaw',
 17: 'Icelandic',
 18: 'Porsche',
 19: 'bishop',
 20: 'Melbourne',
 21: 'FIFA',
 22: 'Philadelphia',
 23: 'Minneapolis',
 24: 'jazz',
 25: 'Intel',
 26: 'actor',
 27: 'French',
 28: 'Antarctica',
 29: 'Europe',
 30: 'Pittsburgh',
 31: 'Canada',
 32: 'opera',
 33: 'Lyon',
 34: 'Paris',
 35: 'Ferrari',
 36: 'Russian',
 37: 'Triumph',
 38: 'Jeep',
 39: 'CBS',
 40: 'Warsaw',
 41: 'Egypt',
 42: 'politician',
 43: 'Antarctica',
 44: 'Hebrew',
 45: 'Italian',
 46: 'Florence',
 47: 'French',
 48: 'IBM',
 49: 'Nissan',
 50: 'Birmingham',
 51: 'Antarctica',
 52: 'French',
 53: 'Russia',
 54: 'Finland',
 55: 'physiology',
 56: 'Toyota',
 57: 'Gap',
 58: 'English',
 59: 'Israel',
 60: 'Canada',
 61: 'Edinburgh',
 62: 'Rome',
 63: 'English',
 64: 'ac

In [11]:
cft_prompt = cft_prompts["alex-1"]
cft_prompt.apply(cft["train"][0])


['"The mother tongue of Danielle Darrieux is English" Is that accurate? Answer with one word.',
 'No']

In [50]:
q = 'Multiple choice question: Barack Obama Was 1. The President 2. The Secretary of State"\n A:'
run_model(q)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: The President\n\nQuestion: President',
 'Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: Obama 2, Kerry 1\n\n',
 'Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: The President is both 3. The',
 'Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: 1. Both\n B: Both',
 'Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: 2\n\n2 \n\n',
 'Multiple choice question: Q: Barack Obama Was 1. The President 2. The Secretary of State"\n A: # 1 3. A great deal']

In [2]:
# load prompt from promptsource
with open('promptsource/templates/counterfact_easy/templates.yaml', 'rb') as f:

Found cached dataset parquet (/home/alex/.cache/huggingface/datasets/azhx___parquet/azhx--counterfact-easy-635c687fc0702e3e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 163.45it/s]
