In [50]:
import os
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing as mp
from datasets import load_dataset
from scipy.stats import entropy
from tqdm import tqdm
from scipy.integrate import quad
tqdm.pandas()

RECITATION_THRESHOLD = 5
SAMPLE_SEED = 20
SAMPLE_SIZE = 4

In [2]:
memories_path = "usvsnsp/generation-semantic-filters"
target_split = "memories_deduped_12b"
pile_frame = load_dataset(memories_path, split=target_split).to_pandas()
pile_frame["Memorized"] = pile_frame["memorization_score"] >= 1
pile_frame["is_code"] = pile_frame["nl_scores"].progress_apply(lambda x: x <= 0.45)
print(f"Loaded {len(pile_frame)} examples from {memories_path}/{target_split}")
pile_frame.head(1)

100%|██████████| 1871216/1871216 [00:00<00:00, 4505382.70it/s]

Loaded 1871216 examples from usvsnsp/generation-semantic-filters/memories_deduped_12b





Unnamed: 0,sequence_id,tokens,text,is_incrementing,is_repeating,sequence_duplicates,max_frequency,avg_frequency,min_frequency,median_frequency,...,0_9_templates,huffman_coding_length,memorization_score,index,loss,prompt_perplexity,generation_perplexity,sequence_perplexity,Memorized,is_code
0,1260,"[15, 681, 16, 1825, 77, 16, 285, 1239, 352, 10...",.com/apsl/ and read it before using this\n * f...,False,False,2476,11740996961,2490852000.0,1210739,331265808.0,...,2,4.698413,1.0,1260,0.509766,1.23942,1.00333,1.243547,True,True


In [3]:
def get_category(row):
    if row["Memorized"] == False:
        return "Not Memorized"
    if row["sequence_duplicates"] > RECITATION_THRESHOLD:
        return "Recitation"
    if row["is_incrementing"] or row["is_repeating"]:
        return "Reconstruction"

    return "Recollection"

pile_frame["Category"] = pile_frame.progress_apply(get_category, axis=1)
pile_frame["Category"].value_counts()

100%|██████████| 1871216/1871216 [00:05<00:00, 365837.14it/s]


Category
Recitation        1566369
Recollection       279736
Reconstruction      25111
Name: count, dtype: int64

In [4]:
pile_frame = pile_frame[["Category", "is_code", "text"]]
pile_frame

Unnamed: 0,Category,is_code,text
0,Recitation,True,.com/apsl/ and read it before using this\n * f...
1,Recitation,False,\plain\intbl\cf1\f51\fs20\ql \~\cell\pard\plai...
2,Recitation,False,\n 502. \n 503. \n 504. \n 505. \n 506. ...
3,Recitation,True,<li>ERROR_WIDGET_DISPOSED - if the receiver ha...
4,Recitation,False,1><span>I LOVE TOMATO!!!!</span></h1><br>\r\n ...
...,...,...,...
1871211,Recitation,False,own caption\n\nadd your own caption\n\nadd yo...
1871212,Recitation,True,m_LightProbeUsage: 1\n m_ReflectionProbeUsa...
1871213,Recollection,True,<AccelerateInterpolator>(jobj)\r\n\t\t{\r\n\t\...
1871214,Recitation,False,\n\nPLOS Pathogens\n\nKasturi Haldar\n\nEditor...


In [51]:
samples_data_path = "./samples"
if not os.path.exists(samples_data_path):
    os.makedirs(samples_data_path)

# set pandas display options to show full text
pd.set_option("display.max_colwidth", None)

recitation_sample = pile_frame[pile_frame["Category"] == "Recitation"].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
display(recitation_sample)

reconstruction_sample = pile_frame[pile_frame["Category"] == "Reconstruction"].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
display(reconstruction_sample)

recollection_sample = pile_frame[pile_frame["Category"] == "Recollection"].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
display(recollection_sample)

Unnamed: 0,Category,is_code,text
822636,Recitation,True,"inate\Support\Facades\Auth::class,\n 'Blade' => Illuminate\Support\Facades\Blade::class,"
1103121,Recitation,True,"any cline-neutral"">&nbsp;</span>\n<span class=""cline-any cline-neutral"">&nbsp;</span>\n<span class="""
1132635,Recitation,True,{amsmath}\n \usepackage{wasysym} \n \usepackage{amsfonts} \n \usepackage{amssymb} \n \usepackage{amsbsy}\n
1578591,Recitation,False,"How to Buy\n\nAll of the products MonotaRO Singapore offers are subject to Japanese local laws, regulations, and certification standards.\nPlease make sure that"


Unnamed: 0,Category,is_code,text
759315,Reconstruction,True,"foo#hand24""\n-""http://localhost/foo#hand25""\n-""http://localhost/foo#hand26""\n-""http://localhost"
843457,Reconstruction,True,"qa\t\txmm3, xmm0\n\t\tmovdqa\t\txmm4, xmm0\n\t\tmovdqa\t\txmm5,"
1322706,Reconstruction,False,"109-ijms-15-11142],[@B110-ijms-15-11142],[@B111-ijms-15-11142],[@B112-"
1106067,Reconstruction,False,"-08762],[@B13-molecules-17-08762],[@B14-molecules-17-08762],[@B15-molecules-17-087"


Unnamed: 0,Category,is_code,text
782870,Recollection,True,.amazonaws.com/ezbatterypass/index.htmluniross-aa-2700mah-4-rechargeable-b
1589436,Recollection,True,"1,rep,name=instance_configs,json=instanceConfigs,proto3"" json:""instance_configs,omitempty""`\n\t//"
72801,Recollection,False,"vacation rentals near lovely places in Yarmouth. From HomeAway travelers, the most popular points of interest to stay for a holiday trip or just for a"
1464424,Recollection,True,"SW_SHOWNOACTIVATE""),\tSW_SHOWNOACTIVATE},\r\n\t\t//{_T(""SW_SHOWNA""),\t\t\t"


In [63]:
combined_sample = pd.concat([recitation_sample, reconstruction_sample, recollection_sample])

for i, row in combined_sample.iterrows():
    replacements = [
        ("  ", " "),
        ("\n", ""),
        ("\t", " "),
        ("\r", " "),
        ("//", ""),
        ("/*", ""),
        ("*/", ""),
        ("$", ""),
        ("{" , ""),
        ("}", ""),
        ("&", ""),
        ("%", ""),
    ]
    single_line = row["text"]
    for old, new in replacements:
        single_line = single_line.replace(old, new)
    
    if len(single_line) > 20:
        single_line = single_line[:75] + "..."
        
    single_line = f"\\verb| {single_line} |"
    
    latex_table_line = f"{row['Category']} & {single_line} \\\\"
    print(latex_table_line)
    print("\midrule")

Recitation & \verb| inate\Support\Facades\Auth::class,    'Blade' => Illuminate\Support\Facades... | \\
\midrule
Recitation & \verb| any cline-neutral">nbsp;</span><span class="cline-any cline-neutral">nbsp;<... | \\
\midrule
Recitation & \verb| amsmath        \usepackagewasysym         \usepackageamsfonts         \usep... | \\
\midrule
Recitation & \verb| How to BuyAll of the products MonotaRO Singapore offers are subject to Japa... | \\
\midrule
Reconstruction & \verb| foo#hand24"-"http:localhost/foo#hand25"-"http:localhost/foo#hand26"-"http:l... | \\
\midrule
Reconstruction & \verb| qa  xmm3, xmm0  movdqa  xmm4, xmm0  movdqa  xmm5,... | \\
\midrule
Reconstruction & \verb| 109-ijms-15-11142],[@B110-ijms-15-11142],[@B111-ijms-15-11142],[@B112-... | \\
\midrule
Reconstruction & \verb| -08762],[@B13-molecules-17-08762],[@B14-molecules-17-08762],[@B15-molecules... | \\
\midrule
Recollection & \verb| .amazonaws.com/ezbatterypass/index.htmluniross-aa-2700mah-4-rechargeable-b... | \\
\midru