In [10]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset, get_dataset_split_names, DatasetDict
from tqdm import tqdm

# Global seaborn defaults: dark-grid theme with the larger "talk" context scaling.
sns.set_theme(style="darkgrid")
sns.set_context("talk")
# Register tqdm's pandas integration so DataFrame.progress_apply is available.
tqdm.pandas()

## Load Datasets

In [11]:
# Number of leading rows requested from each dataset split by the loading cells.
# A falsy value (0 or None) makes those cells request the whole split instead.
split_sample_size = 100

In [12]:
memories_path = "usvsnsp/memories-semantic-memorization-filter-results"
# Resolve the split names a single time and reuse the list for the loop below.
memories_splits = get_dataset_split_names(memories_path)
memories_dataset = DatasetDict()

for split in tqdm(memories_splits):
    # "name[:N]" asks for only the first N rows of the split; a falsy
    # split_sample_size requests the full split.
    split_spec = f"{split}[:{split_sample_size}]" if split_sample_size else split
    memories_dataset[split] = load_dataset(memories_path, split=split_spec)

memories_dataset

  0%|          | 0/16 [00:00<?, ?it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--memories-semantic-memorization-filter-results-7ad10bc8c7f6aa70/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
  6%|▋         | 1/16 [00:00<00:11,  1.26it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--memories-semantic-memorization-filter-results-7ad10bc8c7f6aa70/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
 12%|█▎        | 2/16 [00:01<00:10,  1.28it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--memories-semantic-memorization-filter-results-7ad10bc8c7f6aa70/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
 19%|█▉        | 3/16 [00:02<00:10,  1.28it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--memories-semantic-m

DatasetDict({
    memories.deduped.70m: Dataset({
        features: ['sequence_id', 'text', 'sequence_duplicates', 'max_frequency', 'avg_frequency', 'min_frequency', 'median_frequency', 'p25_frequency', 'p75_frequency', 'frequencies', 'is_incrementing', 'tokens', 'repeating_offset', 'num_repeating', 'smallest_repeating_chunk', 'memorization_score', 'templating_frequency_0.9', 'templating_frequency_0.8', 'prompt_perplexity', 'generation_perplexity', 'sequence_perplexity'],
        num_rows: 100
    })
    memories.duped.6.9b: Dataset({
        features: ['sequence_id', 'text', 'sequence_duplicates', 'max_frequency', 'avg_frequency', 'min_frequency', 'median_frequency', 'p25_frequency', 'p75_frequency', 'frequencies', 'is_incrementing', 'tokens', 'repeating_offset', 'num_repeating', 'smallest_repeating_chunk', 'memorization_score', 'templating_frequency_0.9', 'templating_frequency_0.8', 'prompt_perplexity', 'generation_perplexity', 'sequence_perplexity'],
        num_rows: 100
    })
   

In [13]:
pile_path = "usvsnsp/pile-semantic-memorization-filter-results"
# Resolve the split names a single time and reuse the list for the loop below.
pile_splits = get_dataset_split_names(pile_path)
pile_dataset = DatasetDict()

for split in tqdm(pile_splits):
    # "name[:N]" asks for only the first N rows of the split; a falsy
    # split_sample_size requests the full split.
    split_spec = f"{split}[:{split_sample_size}]" if split_sample_size else split
    pile_dataset[split] = load_dataset(pile_path, split=split_spec)

pile_dataset

  0%|          | 0/16 [00:00<?, ?it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--pile-semantic-memorization-filter-results-e8ad7274ba998093/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
  6%|▋         | 1/16 [00:00<00:13,  1.13it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--pile-semantic-memorization-filter-results-e8ad7274ba998093/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
 12%|█▎        | 2/16 [00:01<00:12,  1.16it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--pile-semantic-memorization-filter-results-e8ad7274ba998093/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
 19%|█▉        | 3/16 [00:02<00:10,  1.20it/s]Found cached dataset parquet (/home/kyobrien/.cache/huggingface/datasets/usvsnsp___parquet/usvsnsp--pile-semantic-memorization-filt

DatasetDict({
    pile.deduped.12b: Dataset({
        features: ['sequence_id', 'text', 'sequence_duplicates', 'max_frequency', 'avg_frequency', 'min_frequency', 'median_frequency', 'p25_frequency', 'p75_frequency', 'frequencies', 'is_incrementing', 'tokens', 'repeating_offset', 'num_repeating', 'smallest_repeating_chunk', 'memorization_score', 'templating_frequency_0.9', 'templating_frequency_0.8', 'prompt_perplexity', 'generation_perplexity', 'sequence_perplexity'],
        num_rows: 100
    })
    pile.duped.6.9b: Dataset({
        features: ['sequence_id', 'text', 'sequence_duplicates', 'max_frequency', 'avg_frequency', 'min_frequency', 'median_frequency', 'p25_frequency', 'p75_frequency', 'frequencies', 'is_incrementing', 'tokens', 'repeating_offset', 'num_repeating', 'smallest_repeating_chunk', 'memorization_score', 'templating_frequency_0.9', 'templating_frequency_0.8', 'prompt_perplexity', 'generation_perplexity', 'sequence_perplexity'],
        num_rows: 100
    })
    pile.de

In [14]:
# Parameter count keyed by the model-size suffix taken from a split name.
split_to_param_count = {
    "70m": 70_000_000,
    "160m": 160_000_000,
    "410m": 410_000_000,
    "1b": 1_000_000_000,
    "1.4b": 1_400_000_000,
    "2.8b": 2_800_000_000,
    "6.9b": 6_900_000_000,
    "12b": 12_000_000_000,
}

In [16]:
def _split_to_frame(dataset_dict, split, memorized):
    """Convert one dataset split into a DataFrame tagged with model metadata.

    Args:
        dataset_dict: Mapping from split name to a dataset exposing ``to_pandas()``.
        split: Split name; everything after its second "." is used as the model size.
        memorized: Value stored in the "Memorized" column for every row.

    Returns:
        The split as a DataFrame without the "text", "frequencies" and "tokens"
        columns, plus "Model", "Param Count", "Deduped" and "Memorized" columns.

    Raises:
        KeyError: If the model size is missing from ``split_to_param_count``.
    """
    # The size itself can contain a dot (e.g. "6.9b"), so rejoin all trailing fields.
    model_name = ".".join(split.split(".")[2:])
    return (
        dataset_dict[split]
        .to_pandas()
        .drop(columns=["text", "frequencies", "tokens"])
        .assign(
            **{
                "Model": model_name,
                # Keyed on the parsed name rather than the first row, so an
                # empty split does not fail the lookup.
                "Param Count": split_to_param_count[model_name],
                "Deduped": "deduped" in split,
                "Memorized": memorized,
            }
        )
    )


# Collect every per-split frame first and concatenate once, instead of
# re-copying the accumulated frame on each loop iteration.
split_frames = [
    _split_to_frame(memories_dataset, split, memorized=True)
    for split in tqdm(memories_dataset, desc="Loading Memories")
]
split_frames += [
    _split_to_frame(pile_dataset, split, memorized=False)
    for split in tqdm(pile_dataset, desc="Loading Pile")
]
# NOTE(review): each split keeps its own row index, so index labels repeat
# across splits; pass ignore_index=True here if a unique index is needed.
combined_dataframe = pd.concat(split_frames)

display(combined_dataframe.shape)
combined_dataframe.head()

Loading Memories: 100%|██████████| 16/16 [00:00<00:00, 243.52it/s]
Loading Pile: 100%|██████████| 16/16 [00:00<00:00, 241.79it/s]


(3200, 22)

Unnamed: 0,sequence_id,sequence_duplicates,max_frequency,avg_frequency,min_frequency,median_frequency,p25_frequency,p75_frequency,is_incrementing,repeating_offset,...,memorization_score,templating_frequency_0.9,templating_frequency_0.8,prompt_perplexity,generation_perplexity,sequence_perplexity,Model,Param Count,Deduped,Memorized
0,21590,55,11740996961,937904100.0,3053059,277329702.0,20962725,395603541,True,0,...,1.0,20,97,2.328125,1.075195,2.503906,70m,70000000,True,True
1,30252,21482,10346382453,2780063000.0,1869557,385281005.0,13592032,695610999,True,0,...,1.0,696,2332,1.542969,1.011719,1.560547,70m,70000000,True,True
2,137712,247,1659997854,179874800.0,174322,73435281.0,6991548,134386248,True,0,...,1.0,131,147,3.611328,1.052734,3.800781,70m,70000000,True,True
3,166578,91591,11740996961,1282113000.0,4597098,135808597.0,33268754,553893182,False,0,...,1.0,13541,36616,1.256836,1.0,1.256836,70m,70000000,True,True
4,185820,2,11740996961,986137300.0,1250890,164541856.5,33268754,553893182,False,0,...,1.0,1,1,2.976562,1.0,2.976562,70m,70000000,True,True


### Assign Examples to Taxonomy

In [17]:
def get_category(row, recitation_threshold=200):
    """Assign a taxonomy label to a single row.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            "Memorized", "sequence_duplicates", "is_incrementing" and
            "num_repeating" fields.
        recitation_threshold: Smallest "sequence_duplicates" value at which a
            memorized row is labelled as recitation. Defaults to 200.

    Returns:
        "n/a" when the row is not memorized; otherwise the first rule that
        matches, checked in the order "recitation", "reconstruction", with
        "recollection" as the fallback.
    """
    if not row["Memorized"]:
        return "n/a"
    if row["sequence_duplicates"] >= recitation_threshold:
        return "recitation"
    # NOTE(review): -1 looks like the "no repeating chunk" sentinel for
    # num_repeating; confirm against the filter that produces the column.
    if row["is_incrementing"] or row["num_repeating"] != -1:
        return "reconstruction"

    return "recollection"

# Label each row via get_category (row-wise, with a tqdm progress bar), store the
# labels in a new "category" column, then show the number of rows per label.
category_labels = combined_dataframe.progress_apply(get_category, axis=1)
combined_dataframe["category"] = category_labels
combined_dataframe.value_counts("category")

100%|██████████| 3200/3200 [00:00<00:00, 129235.69it/s]


category
n/a               1600
recitation         776
recollection       677
reconstruction     147
dtype: int64

In [None]:
# code_path = "usvsnsp/pile-pythia-code-vs-nl-scores"
# code_dataset = load_dataset(code_path)["train"].to_pandas()
# code_dataset

In [None]:
# # Join combined_dataframe with code_dataset on sequence_id
# combined_dataframe = combined_dataframe.merge(code_dataset, on="sequence_id", how="inner")
# combined_dataframe["is_code"] = combined_dataframe["nl_score"] <= 0.45
# display(combined_dataframe.shape)
# combined_dataframe.head()

## Plot Graphs