In [1]:
from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm
import plotly.express as px
import pandas as pd
import torch
import os

# Turn on tqdm's pandas integration (presumably for `progress_apply`-style
# progress bars on DataFrames — no visible cell in this notebook uses it yet).
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class HFMemoriesDataset(Dataset):
    """Torch dataset that yields the decoded text of memorized token sequences.

    Every record must expose a ``"tokens"`` entry; that entry is handed to
    ``tokenizer.decode`` and the resulting string is the dataset item.

    Args:
        memories: Indexable collection of records. When ``sample`` is given it
            must also provide ``to_pandas()`` (presumably a Hugging Face
            dataset split — confirm against the caller).
        tokenizer: Object with a ``decode(token_ids)`` method.
        sample: Optional number of records to draw at random. ``None`` keeps
            every record, in the original order.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            subsample is reproducible across kernel restarts. ``None`` (the
            default) keeps the previous, unseeded behaviour.
    """

    # Class-level default; overridden per instance once `memories` has been
    # converted to a DataFrame by sampling.
    is_dataframe = False

    def __init__(self, memories, tokenizer, sample=None, random_state=None):
        self.tokenizer = tokenizer
        self.memories = memories
        if sample is not None:
            self.memories = self.memories.to_pandas().sample(
                sample, random_state=random_state
            )
            self.is_dataframe = True

    def __getitem__(self, index):
        """Return the decoded text of the record at position ``index``."""
        # A sampled DataFrame keeps its original index labels, so rows must be
        # addressed by position (`.iloc`); plain `[]` would select a column.
        memory_record = (
            self.memories.iloc[index] if self.is_dataframe else self.memories[index]
        )
        return self.tokenizer.decode(memory_record["tokens"])

    def __len__(self):
        """Number of records available (after sampling, if any)."""
        return len(self.memories)


def load_tokenizer(split_name):
    """Load the tokenizer of the Pythia checkpoint that matches ``split_name``.

    The text after the last ``"duped."`` in ``split_name`` is used as the model
    size, and a name starting with ``"deduped"`` selects the ``-deduped``
    checkpoint (e.g. ``"deduped.160m"`` -> ``EleutherAI/pythia-160m-deduped``).
    The tokenizer's pad token is set to its EOS token before it is returned.
    """
    size = split_name.rpartition("duped.")[2]
    suffix = '-deduped' if split_name.startswith("deduped") else ''
    checkpoint = f"EleutherAI/pythia-{size}{suffix}"

    tok = AutoTokenizer.from_pretrained(checkpoint)
    # Reuse EOS as the padding token so batched calls can pad.
    tok.pad_token = tok.eos_token
    return tok

def load_model(split_name, device="cuda:7"):
    """Load the Pythia checkpoint matching ``split_name`` in eval mode.

    Args:
        split_name: Split identifier such as ``"deduped.160m"``. The text after
            the last ``"duped."`` is the model size; a leading ``"deduped"``
            selects the ``-deduped`` checkpoint.
        device: Device (string or ``torch.device``) the model is moved to.
            Defaults to ``"cuda:7"``, the value that used to be hard-coded, so
            existing calls behave exactly as before.

    Returns:
        The model returned by ``GPTNeoXForCausalLM.from_pretrained``, moved to
        ``device`` and switched to evaluation mode.
    """
    is_deduped = split_name.startswith("deduped")
    model_size = split_name.split("duped.")[-1]
    corresponding_model = f"EleutherAI/pythia-{model_size}{'-deduped' if is_deduped else ''}"
    model = GPTNeoXForCausalLM.from_pretrained(corresponding_model)
    return model.to(torch.device(device)).eval()

In [3]:
# Dataset split to analyse; the same name selects the matching Pythia checkpoint.
split_name = "deduped.160m"

# Load the split once and share it between the raw view (`memories`, used later
# to build the results frame) and the torch dataset wrapper. The previous
# version called `load_dataset` a second time for the very same split.
memories = load_dataset("EleutherAI/pythia-memorized-evals")[split_name]
tokenizer = load_tokenizer(split_name)
memories_dataset = HFMemoriesDataset(memories, tokenizer)

pythia_model = load_model(split_name)

Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 16/16 [00:00<00:00, 30.38it/s]
Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 16/16 [00:00<00:00, 30.14it/s]


In [None]:
def calculate_perplexity(logits, labels):
    """Perplexity of one sequence under next-token prediction.

    The logits at position ``t`` are scored against the label at ``t + 1``:
    the last logits row and the first label are dropped before the mean
    cross-entropy is taken and exponentiated. Labels equal to ``-100`` are
    skipped, as that is the cross-entropy default ``ignore_index``.

    Args:
        logits: Per-position scores, indexed as ``[position, vocab]``.
        labels: Token ids, one per position.

    Returns:
        A scalar tensor holding ``exp(mean cross-entropy)``.
    """
    predictions = logits.detach()[:-1, :]
    targets = labels[1:]
    mean_nll = torch.nn.functional.cross_entropy(
        predictions.reshape(-1, predictions.size(-1)),
        targets.reshape(-1),
    )
    return mean_nll.exp()

data_loader = DataLoader(memories_dataset, batch_size=100)
all_perplexities = []

# Follow the model to whatever device it was loaded on instead of repeating a
# hard-coded device string here.
device = pythia_model.device

with torch.no_grad():
    for batch in tqdm(data_loader):
        tokenized_batch = tokenizer(
            batch, return_tensors="pt", max_length=512, truncation=True, padding=True
        ).to(device)
        input_ids = tokenized_batch["input_ids"]

        # The pad token is the EOS token, so padding can only be told apart via
        # the attention mask. -100 is CrossEntropyLoss's default ignore_index,
        # which keeps padded positions out of each sequence's perplexity.
        # NOTE: a text that tokenizes to fewer than two tokens has no scored
        # position left and yields NaN.
        labels = input_ids.masked_fill(tokenized_batch["attention_mask"] == 0, -100)

        # No `labels=` here: the loss the model would compute was never read.
        logits = pythia_model(**tokenized_batch).logits

        perplexities = [calculate_perplexity(logits[i], labels[i]) for i in range(len(logits))]
        # One device-to-host transfer per batch rather than one `.item()` per sample.
        all_perplexities += torch.stack(perplexities).tolist()

print(len(all_perplexities))
# Preview only — the full list has one entry per memorized sequence.
all_perplexities[:10]

 13%|█▎        | 1141/9082 [02:50<19:44,  6.71it/s]


In [None]:
# Attach one perplexity per memorized sequence. This relies on the loader
# having walked the dataset in its stored order (no shuffling), so the i-th
# perplexity belongs to the i-th row.
memories_df = memories.to_pandas().assign(perplexity=all_perplexities)
memories_df