In [3]:
import numpy as np
from tqdm import trange

from mmap_dataset import MMapIndexedDataset

In [4]:
PYTHIA_SUITE = 'deduped' # Choose between 'standard' and 'deduped'

# Fail fast on a typo: every later path and split name is derived from this value.
if PYTHIA_SUITE not in ('standard', 'deduped'):
    raise ValueError(
        f"PYTHIA_SUITE must be 'standard' or 'deduped', got {PYTHIA_SUITE!r}"
    )

dataset_path = f'/mnt/ssd-1/pile_preshuffled/{PYTHIA_SUITE}/document'
dataset = MMapIndexedDataset(dataset_path, skip_warmup = True)

# Number of sequences scanned by this notebook.
# NOTE(review): presumably 143000 training steps x 1024 sequences per step -- confirm.
total_sequences = 143000 * 1024

# Read the dataset once, batch by batch. The batches are discarded here;
# presumably this pass only checks that every slice is readable (and warms the
# page cache) -- TODO confirm.
batch_size = 10240
for i in trange(0, total_sequences, batch_size):
    batch = dataset[i:i + batch_size]

    reading sizes...
    reading pointers...
    reading document index...
    creating numpy buffer of mmap...
    creating memory view of numpy buffer...


100%|████████████████████████████████████████| 14300/14300 [00:31<00:00, 448.36it/s]


# Frequencies

In [3]:
# Libraries

import pandas as pd

# Load Token distributions

import json

# Read the per-token counts for the selected suite; the file handle is only
# needed while parsing the JSON.
with open(f'{PYTHIA_SUITE}_memorized_frequencies.json', 'r') as frequencies_file:
    memorized_data = json.load(frequencies_file)

# Key every count by its position in the corresponding JSON list.
memorized_distribution = dict(enumerate(memorized_data['memorized']))
non_memorized_distribution = dict(enumerate(memorized_data['non_memorized']))


# Per-token lookup tables, indexed by token id and zero-filled up front.
# 60000 slots are allocated for each table.

# Token probabilities (filled in by the vocabulary loop)
memorized_probabilities = [0] * 60000

# Token occurrences (filled in by the vocabulary loop)
token_occurances = [0] * 60000


from transformers import GPTNeoXTokenizerFast

# Load the tokenizer published as "EleutherAI/pythia-70m"; its vocabulary is
# iterated below to visit every token id.
tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/pythia-70m")

from tqdm.auto import tqdm

# For every vocabulary entry, record how often the token occurs in total and
# which fraction of those occurrences is counted as memorized.
for (_token, token_id) in tqdm(tokenizer.vocab.items()):
    memorized_count = memorized_distribution[token_id]
    total_count = memorized_count + non_memorized_distribution[token_id]
    token_occurances[token_id] = total_count
    # A token that never occurs has no defined ratio; its probability slot is
    # left untouched.
    if total_count != 0:
        memorized_probabilities[token_id] = memorized_count / total_count

  0%|          | 0/50277 [00:00<?, ?it/s]

In [4]:
# Total number of token occurrences across all slots of the table; this is the
# numerator of the unigram perplexities computed below.
all_occs = sum(token_occurances)

# Calculate Token Unigram Perplexities

In [5]:
# Unigram perplexity of a token = total occurrences / occurrences of that token.
# Every slot starts at 1e20, which stands in for "infinite"; tokens that never
# occur simply keep that value.
unigram_perplexities = [1e20] * 60000
for token_id, occurrences in enumerate(token_occurances):
    if occurrences != 0:
        unigram_perplexities[token_id] = all_occs / occurrences

# Apply filters across tokens

In [6]:
# Accumulators for the per-sequence statistics: one entry is appended to each
# list for every sequence that gets scanned.
max_frequencies, min_frequencies = [], []
max_perplexities, min_perplexities = [], []

In [7]:
# For every sequence, record the largest/smallest token frequency and unigram
# perplexity found among its first 64 tokens.
#
# The lookup tables are converted to NumPy arrays once, so a whole batch is
# scored with one fancy-index per table plus row-wise reductions instead of a
# Python-level call per token.
# The occurrence counts are integers, so int64 holds them exactly.
occurrence_table = np.asarray(token_occurances, dtype=np.int64)
perplexity_table = np.asarray(unigram_perplexities, dtype=np.float64)

# Start from empty lists so that re-running this cell does not append a second
# copy of every row to results left over from an earlier run.
max_frequencies = []
min_frequencies = []
max_perplexities = []
min_perplexities = []

batch_size = 10240
for i in trange(0, 143000*1024, batch_size):
    batch = np.asarray(dataset[i:i + batch_size][:, :64])
    if len(batch) == 0:
        # Nothing to score; an empty batch contributes no rows.
        continue
    # Both lookups have the same shape as `batch`: (sequences, tokens).
    seq_frequencies = occurrence_table[batch]
    seq_perplexities = perplexity_table[batch]
    # .tolist() converts back to plain Python numbers, matching what the
    # accumulator lists held before vectorisation.
    max_frequencies.extend(seq_frequencies.max(axis=1).tolist())
    min_frequencies.extend(seq_frequencies.min(axis=1).tolist())
    max_perplexities.extend(seq_perplexities.max(axis=1).tolist())
    min_perplexities.extend(seq_perplexities.min(axis=1).tolist())

 45%|███████████████████                       | 6497/14300 [22:03<26:02,  4.99it/s]

In [8]:
# Empty frame; the per-sequence statistic columns are attached in the next cell.
freq_results = pd.DataFrame()

In [9]:
# Attach one column per accumulated statistic, in a fixed left-to-right order.
statistic_columns = {
    'max_freq': max_frequencies,
    'min_freq': min_frequencies,
    'max_unigram_perp': max_perplexities,
    'min_unigram_perp': min_perplexities,
}
for column_name, column_values in statistic_columns.items():
    freq_results[column_name] = column_values

In [10]:
# Summary statistics for the per-sequence maximum token frequency only.
freq_results['max_freq'].describe()

In [11]:
# Summary statistics for every column of the per-sequence table.
freq_results.describe()

In [12]:
# Persist the per-sequence table under the 'memorization' key of a
# suite-specific HDF file so the results section can reload it.
freq_results.to_hdf(
    f"{PYTHIA_SUITE}_freq_res.hdf",
    key='memorization',
    index=False,
)

# Results

In [1]:
from datasets import load_dataset

In [5]:
# Translate the suite name into the prefix used by the split names:
# 'standard' maps to 'duped', anything else maps to 'deduped'.
ds_type = {'standard': 'duped'}.get(PYTHIA_SUITE, 'deduped')

In [8]:
# Load the split named f'{ds_type}.12b' of the memorized-evals dataset; its
# 'index' column is used below to flag sequences.
ds = load_dataset('EleutherAI/pythia-memorized-evals', split = f'{ds_type}.12b')

Downloading readme:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /home/orz/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/76.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/150M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/103M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/45.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/124M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/126M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/67.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/16 [00:00<?, ?it/s]

Generating deduped.1.4b split:   0%|          | 0/1048097 [00:00<?, ? examples/s]

Generating deduped.12b split:   0%|          | 0/1871215 [00:00<?, ? examples/s]

Generating deduped.160m split:   0%|          | 0/581195 [00:00<?, ? examples/s]

Generating deduped.1b split:   0%|          | 0/1032865 [00:00<?, ? examples/s]

Generating deduped.2.8b split:   0%|          | 0/1355211 [00:00<?, ? examples/s]

Generating deduped.410m split:   0%|          | 0/811039 [00:00<?, ? examples/s]

Generating deduped.6.9b split:   0%|          | 0/1680294 [00:00<?, ? examples/s]

Generating deduped.70m split:   0%|          | 0/411448 [00:00<?, ? examples/s]

Generating duped.1.4b split:   0%|          | 0/1373722 [00:00<?, ? examples/s]

Generating duped.12b split:   0%|          | 0/2382326 [00:00<?, ? examples/s]

Generating duped.160m split:   0%|          | 0/689673 [00:00<?, ? examples/s]

Generating duped.1b split:   0%|          | 0/1256141 [00:00<?, ? examples/s]

Generating duped.2.8b split:   0%|          | 0/1675077 [00:00<?, ? examples/s]

Generating duped.410m split:   0%|          | 0/970341 [00:00<?, ? examples/s]

Generating duped.6.9b split:   0%|          | 0/2120969 [00:00<?, ? examples/s]

Generating duped.70m split:   0%|          | 0/463953 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/orz/.cache/huggingface/datasets/EleutherAI___parquet/EleutherAI--pythia-memorized-evals-623aaa371a33821a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


In [10]:
import pandas as pd

In [11]:
# Reload the per-sequence table written by the frequencies section.
# `index` is not a read_hdf option (extra keywords are only forwarded to the
# underlying HDFStore), so it is not passed here.
freq_results = pd.read_hdf(f"{PYTHIA_SUITE}_freq_res.hdf", key = 'memorization')

In [25]:
# Number of rows in the reloaded table.
len(freq_results)

146432000

In [26]:
# One flag per row of freq_results, all initially False.
is_memorized = [False] * len(freq_results)

In [27]:
# Flip the flag at every position listed in the dataset's 'index' column.
memorized_positions = ds['index']
for position in memorized_positions:
    is_memorized[position] = True

In [28]:
# Peek at the first ten entries of the 'index' column.
ds['index'][:10]

[224, 230, 319, 441, 447, 602, 653, 792, 824, 839]

In [31]:
# Attach the flags as a column; the list was built with one entry per row of
# freq_results.
freq_results['is_memorized'] = is_memorized

# Sequences with least max frequencies

In [40]:
from transformers import AutoTokenizer
# Bind `tokenizer` to the tokenizer published as "EleutherAI/pythia-12b"; the
# decode cells below use it to turn token ids back into text.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-12b")

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [38]:
# The 30 rows with the smallest max_freq (ascending sort, first 30 rows).
freq_results.sort_values(by = 'max_freq').head(30)

Unnamed: 0,max_freq,min_freq,max_unigram_perp,min_unigram_perp,is_memorized
112869388,45572,45572,6583849.0,6583849.0,True
53063152,45572,45572,6583849.0,6583849.0,True
22820299,45572,45572,6583849.0,6583849.0,True
73580712,45572,45572,6583849.0,6583849.0,True
110195786,45572,45572,6583849.0,6583849.0,True
55960672,45572,45572,6583849.0,6583849.0,True
66190691,45572,45572,6583849.0,6583849.0,True
52182333,45572,45572,6583849.0,6583849.0,True
62592488,45572,45572,6583849.0,6583849.0,True
132751014,45572,45572,6583849.0,6583849.0,True


In [43]:
# First 64 token ids of sequence 112869388, the top row of the table above.
dataset[112869388][:64]

array([41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606, 41606,
       41606], dtype=uint16)

In [49]:
# Decode the first 64 token ids of sequence 112869388 back to text.
tokenizer.decode(dataset[112869388][:64])

'ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃ

In [39]:
# First 64 token ids of sequence 94153050.
dataset[94153050][:64]

array([39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922, 39922,
       39922], dtype=uint16)

In [48]:
# Decode the first 64 token ids of sequence 94153050 back to text.
tokenizer.decode(dataset[94153050][:64])

'145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450014514500145145001451450

# Sequences with high max frequencies

In [45]:
# The 30 rows with the largest max_freq (descending sort, first 30 rows).
freq_results.sort_values(by = 'max_freq', ascending = False).head(30)

Unnamed: 0,max_freq,min_freq,max_unigram_perp,min_unigram_perp,is_memorized
0,11740996961,573895,522812.0,25.554829,False
88146716,11740996961,169879,1766193.0,25.554829,False
88146676,11740996961,537157,558568.9,25.554829,False
88146675,11740996961,1024605,292834.0,25.554829,False
88146674,11740996961,485757,617673.4,25.554829,False
88146673,11740996961,814328,368450.0,25.554829,False
88146671,11740996961,448361,669191.0,25.554829,False
88146670,11740996961,434505,690531.0,25.554829,False
88146669,11740996961,1124056,266925.5,25.554829,False
88146666,11740996961,661916,453288.9,25.554829,False


In [47]:
# Decode the first 64 token ids of sequence 0, the top row of the table above.
tokenizer.decode(dataset[0][:64])

' sisters, mothers, and wives who would be governing their lands while their husbands fought in Outremer—she tasted both fear and triumph in her wine.\n\n• • •\n\n"I do not know what I am doing here," Belle whispered. "I swore I would never be alone with you again,'

# Sequences with least min frequencies

In [50]:
# The 30 rows with the smallest min_freq (ascending sort, first 30 rows).
freq_results.sort_values(by = 'min_freq').head(30)

Unnamed: 0,max_freq,min_freq,max_unigram_perp,min_unigram_perp,is_memorized
17425622,11740996961,104,2884992000.0,25.554829,False
129213805,11740996961,104,2884992000.0,25.554829,False
7207380,11740996961,104,2884992000.0,25.554829,False
83786898,11740996961,104,2884992000.0,25.554829,False
78440857,11740996961,104,2884992000.0,25.554829,False
8970130,11740996961,104,2884992000.0,25.554829,False
18792581,11740996961,104,2884992000.0,25.554829,False
5488115,11740996961,166,1807465000.0,25.554829,False
59570376,11740996961,166,1807465000.0,25.554829,False
129079102,11740996961,166,1807465000.0,25.554829,False


In [51]:
# Decode the first 64 token ids of sequence 17425622, the top row of the table above.
tokenizer.decode(dataset[17425622][:64])

'error].").\n\n\n15\nAs marijuana use is not "similar to" public intoxication under § 4A1.2(c)(2), the judgment of the district court is A  FFIRMED.\n\n\n\n1\n Sentences for misdemeanor and petty offenses are counted, except as follows:\n(1) Sent'

# Sequences with highest min frequencies

In [52]:
# The 30 rows with the largest min_freq (descending sort, first 30 rows).
freq_results.sort_values(by = 'min_freq', ascending = False).head(30)

Unnamed: 0,max_freq,min_freq,max_unigram_perp,min_unigram_perp,is_memorized
11014236,11740996961,10346382453,28.999428,25.554829,True
127347099,11740996961,10346382453,28.999428,25.554829,True
63601894,11740996961,10346382453,28.999428,25.554829,True
16963937,11740996961,10346382453,28.999428,25.554829,True
74912559,11740996961,10346382453,28.999428,25.554829,True
40854719,11740996961,10346382453,28.999428,25.554829,True
133662364,11740996961,10346382453,28.999428,25.554829,True
66662795,11740996961,10346382453,28.999428,25.554829,True
3713617,11740996961,10346382453,28.999428,25.554829,True
27247450,11740996961,10346382453,28.999428,25.554829,True


In [54]:
# Decode the first 64 token ids of sequence 11014236, the top row of the table above.
tokenizer.decode(dataset[11014236][:64])

'.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n'

In [55]:
# Decode the first 64 token ids of sequence 39951715 back to text.
tokenizer.decode(dataset[39951715][:64])

'\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n.\n\n'