# 70M Dataset Preliminary Analysis

In [9]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import plotly.express as px
from torch.nn.functional import softmax
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [12]:
class LocalMemoriesDataset(Dataset):
    """Map-style dataset that yields decoded text for memorized token sequences stored on disk.

    Args:
        memories_path: Path to the file holding the memories table. The table
            must expose a "tokens" column.
        tokenizer_path: Identifier or path passed to ``AutoTokenizer.from_pretrained``.
        is_hdf: When true, the file is read with ``pd.read_hdf`` and reduced to a
            random sample of 500 rows; otherwise it is read in full with ``pd.read_csv``.
    """

    def __init__(self, memories_path, tokenizer_path, is_hdf=True):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        if is_hdf:
            # Only the HDF branch is subsampled (unseeded, so rows differ between runs).
            self.memories = pd.read_hdf(memories_path).sample(500)
        else:
            self.memories = pd.read_csv(memories_path)

    def __len__(self):
        """Return the number of memory rows currently held."""
        return len(self.memories)

    def __getitem__(self, index):
        """Return the decoded string for the row at positional ``index``."""
        token_ids = self.memories.iloc[index]["tokens"]
        return self.tokenizer.decode(token_ids)
    
# Build the sampled dataset with the class defined in this cell. The class name
# must be `LocalMemoriesDataset`; no `MemoriesDataset` exists in this notebook, so
# that name raises NameError on a fresh kernel.
# NOTE(review): the path is an absolute, user-specific location; a later cell uses the
# relative "../memorized-data/memories-pythia-duplicated" directory instead.
memories = LocalMemoriesDataset("/home/mchorse/kyleobrien/semantic-memorization/memorized-data/memories-pythia-duplicated/19m.hdf", "EleutherAI/pythia-70m-deduped")
# Batches of decoded strings for the classifier loop further down.
data_loader = DataLoader(memories, batch_size=128)
# Spot-check one decoded memory.
memories[101]

"\n  <script type='text/javascript' src='../../../apidoc/javascripts/bundled/jquery.js'></script>\n<script type='text/javascript' src='../../../apidoc/javascripts/bundled/bootstrap-collapse.js'></script>\n<script"

In [13]:
# GPU index this notebook was originally run on.
PREFERRED_GPU_INDEX = 6
# Hugging Face id shared by the classifier weights and its tokenizer.
CODE_VS_NL_MODEL = "usvsnsp/code-vs-nl"

# Use the preferred GPU when the host has it; otherwise fall back to the first
# GPU, then to CPU, instead of failing on machines with fewer devices.
if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU_INDEX:
    device = torch.device(f"cuda:{PREFERRED_GPU_INDEX}")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# eval() puts the model in inference mode before it is used for scoring.
code_classifier = AutoModelForSequenceClassification.from_pretrained(CODE_VS_NL_MODEL).to(device).eval()
code_tokenizer = AutoTokenizer.from_pretrained(CODE_VS_NL_MODEL)

In [14]:
# Probability cut-off above which a memory is counted as natural language.
# NOTE(review): provenance of this value is not recorded in the notebook — TODO document.
NL_PROBABILITY_THRESHOLD = 0.457414

running_nl_count = 0
dataset_length = len(memories)

# Inference only: no_grad avoids building autograd graphs, so no detach() is needed.
with torch.no_grad():
    for text in tqdm(data_loader):
        # Each batch is a list of decoded strings; pad/truncate to at most 512 tokens.
        tokenized_batch = code_tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)
        outputs = code_classifier(**tokenized_batch)
        # Normalize over the last axis explicitly. For 2-D logits this is the same
        # axis the implicit (deprecated) form picked, without the warning.
        probabilities = softmax(outputs.logits, dim=-1)
        # Column 0 is treated as the natural-language class — assumed from the
        # variable naming here; TODO confirm against the model's label mapping.
        natural_language_count = (probabilities[:, 0] > NL_PROBABILITY_THRESHOLD).sum().item()
        running_nl_count += natural_language_count


Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.

100%|██████████| 4/4 [00:01<00:00,  2.60it/s]


In [15]:
# Share of the classified memories that were counted as natural language.
nl_percent = 100 * running_nl_count / dataset_length
print(f"~{nl_percent}% of the memories are natural language")

# Everything not counted as natural language is reported as code.
code_count = len(memories) - running_nl_count
frame = pd.DataFrame(
    {
        "Types": ["NL", "Code"],
        "Counts": [running_nl_count, code_count],
    }
)

# Pie chart of the two surface forms.
fig = px.pie(
    data_frame=frame,
    names="Types",
    values="Counts",
    title="Memories Makeup",
    width=500,
    height=500,
)
fig.show()

~24.4% of the memories are natural language


In [16]:
# Summary counts for export. The code count is the remainder of the dataset after
# removing the natural-language memories, matching the "Code" row of `frame`.
# NOTE(review): `save_json` is built but not written or displayed in this cell.
save_json = {
    "nlCount": running_nl_count,
    "codeCount": len(memories) - running_nl_count,
}

# Displayed as the cell output: JSON serialization of the Types/Counts table.
frame.to_json()

'{"Types":{"0":"NL","1":"Code"},"Counts":{"0":122,"1":378}}'

In [6]:
# Memory Percents

import os
import plotly.express as px
import pandas as pd

# Model sizes, in the same order as `nl_percents`. Defined here so the cell does not
# depend on a later cell having been executed first.
models = ["19m", "125m", "350m", "800m", "1.3b", "2.7b", "6.7b", "13b"]
# Hand-entered percentages of memories counted as natural language, one per model.
# NOTE(review): the source of these numbers is not recorded in the notebook — TODO.
nl_percents = [25, 24.2, 24.1, 24.3, 24.3, 24.8, 25.8, 26.2]
assert len(models) == len(nl_percents), "each model needs exactly one NL percent"

# Use the `nl_percents` list defined above (no `percents` name exists in this notebook).
data = pd.DataFrame({ "Models": models, "NL Percents": nl_percents })
fig = px.line(data, x="Models", y="NL Percents", markers=True, text="NL Percents", title='Percent of Memories = "Natural Language" by Model')
# Place the value labels so they do not sit on top of the markers.
fig.update_traces(textposition="bottom right")
fig.show()

In [7]:
# Directory holding one HDF file of memories per model size.
data_dir = "../memorized-data/memories-pythia-duplicated"
models = ["19m", "125m", "350m", "800m", "1.3b", "2.7b", "6.7b", "13b"]

# Resolve each model's file, then count the rows of the table stored in it.
hdf_paths = [f"{data_dir}/{model_name}.hdf" for model_name in models]
total_memories_count = [len(pd.read_hdf(hdf_path)) for hdf_path in hdf_paths]
total_memories_count

[463953, 689673, 970341, 1256141, 1373722, 1675077, 2120969, 2382326]

In [40]:
# Memory Counts

import os
import plotly.express as px
import pandas as pd

# Hand-entered percentage of natural-language memories per model, ordered like `models`.
nl_percents = [25, 24.2, 24.1, 24.3, 24.3, 24.8, 25.8, 26.2]
# zip() silently truncates on a length mismatch, so fail loudly instead.
assert len(nl_percents) == len(models) == len(total_memories_count), "per-model lists must have equal length"

# Absolute NL count per model (truncated to an integer); code is the remainder.
nl_counts = [int((percent / 100) * total) for (percent, total) in zip(nl_percents, total_memories_count)]
code_counts = [total - nl_count for (nl_count, total) in zip(nl_counts, total_memories_count)]

# Long-format table: one block of rows per series (NL, Code, Total), each block listing
# every model once. Block sizes follow len(models) rather than a hard-coded 8.
types = ["NL"] * len(models) + ["Code"] * len(models) + ["Total"] * len(models)
counts = nl_counts + code_counts + total_memories_count
dup_models = models * 3
data = pd.DataFrame({ "Models": dup_models, "Type": types , "Count": counts })
data


Unnamed: 0,Models,Type,Count
0,19m,NL,115988
1,125m,NL,166900
2,350m,NL,233852
3,800m,NL,305242
4,1.3b,NL,333814
5,2.7b,NL,415419
6,6.7b,NL,547210
7,13b,NL,624169
8,19m,Code,347965
9,125m,Code,522773


In [43]:
# One line per "Type" value, with the count printed next to each marker.
fig = px.line(
    data,
    x="Models",
    y="Count",
    color="Type",
    markers=True,
    text="Count",
    title='Memories by Binary Surface Form',
)
# Shift the text labels off the markers.
fig.update_traces(textposition="bottom right")
fig.show()

In [26]:
# Scratch expression: a literal list with each model size repeated twice. It is only
# evaluated for display and is not assigned to any name.
# NOTE(review): the NameError about `sort` recorded as this cell's output cannot come
# from this expression (it never references `sort`); the output looks stale — confirm
# and consider deleting the cell.
["19m", "19m","125m", "125m","350m", "350m","800m", "800m","1.3b", "1.3b","2.7b", "2.7b","6.7b", "6.7b","13b", "13b"]

NameError: name 'sort' is not defined

# Deduped Dataset Preliminary Analysis

In [1]:
from datasets import load_dataset

# Hub repository to load; with no split requested, all of its splits are loaded.
MEMORIZED_EVALS_REPO = "EleutherAI/pythia-memorized-evals"
dataset = load_dataset(MEMORIZED_EVALS_REPO)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 1.55k/1.55k [00:00<00:00, 1.16MB/s]
Using custom data configuration EleutherAI--pythia-memorized-evals-623aaa371a33821a


Downloading and preparing dataset None/None to /home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 89.0M/89.0M [00:02<00:00, 43.2MB/s]
Downloading data: 100%|██████████| 67.9M/67.9M [00:01<00:00, 45.1MB/s]
Downloading data: 100%|██████████| 68.1M/68.1M [00:01<00:00, 45.1MB/s]
Downloading data: 100%|██████████| 91.6M/91.6M [00:02<00:00, 44.2MB/s]
Downloading data: 100%|██████████| 91.9M/91.9M [00:01<00:00, 46.0MB/s]
Downloading data: 100%|██████████| 103M/103M [00:02<00:00, 41.6MB/s]t]
Downloading data: 100%|██████████| 104M/104M [00:02<00:00, 43.4MB/s]
Downloading data: 100%|██████████| 125M/125M [00:02<00:00, 46.2MB/s]t]
Downloading data: 100%|██████████| 126M/126M [00:02<00:00, 45.8MB/s]
Downloading data: 100%|██████████| 124M/124M [00:02<00:00, 43.8MB/s]t]
Downloading data: 100%|██████████| 125M/125M [00:02<00:00, 43.5MB/s]
Downloading data: 100%|██████████| 125M/125M [00:02<00:00, 45.0MB/s]
Downloading data: 100%|██████████| 133M/133M [00:03<00:00, 44.2MB/s]t]
Downloading data: 100%|██████████| 133M/133M [00:02<00:00, 45.6MB/s]
Downloading data

Dataset parquet downloaded and prepared to /home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 16/16 [00:00<00:00, 41.13it/s]


In [32]:
# Load every split of the memorized-evals repository (keyed by split name).
memories_datasets = load_dataset("EleutherAI/pythia-memorized-evals")
# Enumerates the split names. The value is discarded: a notebook cell only displays
# its final expression.
list(memories_datasets)
# Convert one split to pandas and show 100 random rows (unseeded, so rows vary per run).
duped_160m_frame = memories_datasets["duped.160m"].to_pandas()
duped_160m_frame.sample(100)

Using custom data configuration EleutherAI--pythia-memorized-evals-623aaa371a33821a
Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 16/16 [00:00<00:00, 41.68it/s]


Unnamed: 0,index,tokens,__index_level_0__
486563,103301148,"[323, 3081, 1491, 5001, 9451, 187, 475, 50275,...",341148
227920,48387030,"[27589, 9026, 10987, 30335, 2593, 1738, 996, 1...",339030
122315,25977437,"[19295, 4110, 33278, 9149, 3003, 28827, 43227,...",809437
73726,15620501,"[22, 13391, 13, 22, 12730, 13, 22, 13210, 13, ...",1892501
533301,113314241,"[5803, 25666, 5935, 13, 3003, 25900, 187, 605,...",1202241
...,...,...,...
644536,136920883,"[2490, 50276, 2358, 15, 2490, 50276, 2090, 15,...",1928883
156746,33320190,"[644, 2011, 281, 320, 625, 6422, 1254, 1311, 8...",1288190
646575,137342264,"[13, 17, 13, 17, 13, 17, 13, 17, 13, 17, 13, 1...",62264
53216,11281325,"[281, 253, 24739, 18687, 1159, 669, 8604, 60, ...",2129325


In [29]:
# Map each split name onto a Pythia checkpoint id and load that checkpoint's tokenizer.
for split_name in memories_datasets:
    # The model size is whatever follows "duped." in the split name.
    model = split_name.split("duped.")[-1]
    # Split names beginning with "deduped" get the "-deduped" checkpoint suffix.
    isDeduped = split_name.startswith("deduped")
    corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}"
    # Rebound on every iteration, so only the last split's tokenizer survives the loop.
    tokenizer = AutoTokenizer.from_pretrained(corresponding_model)

Downloading: 100%|██████████| 394/394 [00:00<00:00, 557kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.24MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 56.7kB/s]


EleutherAI/pythia-160m


Downloading: 100%|██████████| 394/394 [00:00<00:00, 227kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 5.65MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 91.1kB/s]


EleutherAI/pythia-410m


Downloading: 100%|██████████| 394/394 [00:00<00:00, 280kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.39MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 56.9kB/s]


EleutherAI/pythia-1b


Downloading: 100%|██████████| 394/394 [00:00<00:00, 415kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.65MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 71.7kB/s]


EleutherAI/pythia-2.8b-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 284kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:01<00:00, 2.03MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 62.3kB/s]


EleutherAI/pythia-2.8b


Downloading: 100%|██████████| 394/394 [00:00<00:00, 265kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.84MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 66.9kB/s]


EleutherAI/pythia-12b


Downloading: 100%|██████████| 394/394 [00:00<00:00, 313kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.63MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 96.0kB/s]


EleutherAI/pythia-6.9b-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 366kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 4.77MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 112kB/s]


EleutherAI/pythia-160m-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 280kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.15MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 78.8kB/s]


EleutherAI/pythia-1.4b


Downloading: 100%|██████████| 394/394 [00:00<00:00, 225kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 5.35MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 65.7kB/s]


EleutherAI/pythia-1b-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 286kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.12MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 63.4kB/s]


EleutherAI/pythia-410m-deduped
EleutherAI/pythia-70m


Downloading: 100%|██████████| 394/394 [00:00<00:00, 258kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.28MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 54.0kB/s]


EleutherAI/pythia-1.4b-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 262kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.32MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 61.1kB/s]


EleutherAI/pythia-12b-deduped
EleutherAI/pythia-70m-deduped


Downloading: 100%|██████████| 394/394 [00:00<00:00, 490kB/s]
Downloading: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.39MB/s]
Downloading: 100%|██████████| 99.0/99.0 [00:00<00:00, 79.0kB/s]

EleutherAI/pythia-6.9b





In [19]:
class HFMemoriesDataset(Dataset):
    """Map-style dataset that yields decoded text for one split of a Hugging Face dataset.

    Args:
        memories_path: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load from that dataset.
        tokenizer_path: Identifier or path passed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, memories_path, split, tokenizer_path):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.memories = load_dataset(memories_path, split=split)

    def __len__(self):
        """Return the number of records in the loaded split."""
        return len(self.memories)

    def __getitem__(self, index):
        """Return the decoded string for the record at ``index``."""
        # Each record is expected to carry its token ids under the "tokens" key.
        token_ids = self.memories[index]["tokens"]
        return self.tokenizer.decode(token_ids)
    

# Wrap the "deduped.70m" split, decoding with the matching deduped 70m tokenizer.
deduped70m_pytorch_dataset = HFMemoriesDataset(
    memories_path="EleutherAI/pythia-memorized-evals",
    split="deduped.70m",
    tokenizer_path="EleutherAI/pythia-70m-deduped",
)
# Spot-check one decoded memory.
deduped70m_pytorch_dataset[24]

Using custom data configuration EleutherAI--pythia-memorized-evals-623aaa371a33821a
Found cached dataset parquet (/home/mchorse/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


" more believers than you know what to do with! You'll\nsee! Until then, take care of yourself, ya big furball!\n~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~"