# Preliminary Analysis

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from torch import argmax
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
hdf_file = pd.read_hdf("./memorized-data/19m.hdf")
hdf_file

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
pythia_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
hdf_file["tokens"].sample(20).apply(lambda record: pythia_tokenizer.decode(record))

In [None]:
code_model = AutoModelForSequenceClassification.from_pretrained("usvsnsp/code-vs-nl")
code_tokenizer = AutoTokenizer.from_pretrained("usvsnsp/code-vs-nl")

In [None]:
license_sequence = pythia_tokenizer.decode(hdf_file.iloc[1000]["tokens"])
code_sequence = pythia_tokenizer.decode(hdf_file.iloc[1]["tokens"])
error_sequence = pythia_tokenizer.decode(hdf_file.iloc[2]["tokens"])
custom_nl_seqeuence = "This is an experiment by EleutherAI"
print("\n[Seqeuence 1]" + license_sequence + "\n")
print("\n[Seqeuence 2]" + code_sequence + "\n")
print("\n[Seqeuence 3]" + error_sequence + "\n")
print("\n[Seqeuence 4]" + custom_nl_seqeuence + "\n")

In [None]:
license_sequence_label = argmax(code_model(code_tokenizer.encode(license_sequence, return_tensors="pt")).logits, axis=1).item()
code_sequence_label = argmax(code_model(code_tokenizer.encode(code_sequence, return_tensors="pt")).logits, axis=1).item()
error_sequence_label = argmax(code_model(code_tokenizer.encode(error_sequence, return_tensors="pt")).logits, axis=1).item()
custom_nl_seqeuence_label = argmax(code_model(code_tokenizer.encode(custom_nl_seqeuence, return_tensors="pt")).logits, axis=1).item()

print(license_sequence_label)
print(code_sequence_label)
print(error_sequence_label)
print(custom_nl_seqeuence_label)

In [None]:
nl_dataset = hdf_file
nl_dataset["text"] = nl_dataset["tokens"].progress_apply(lambda tokens: pythia_tokenizer.decode(tokens))
nl_dataset

# Build Code Classifier Evaluation Dataset

In order to have traction analyzing all the memorized data, we need to be able to use an accurate classifier to syft through the data. This notebook creates an evaluation dataset using 500 samples selected from all the unique memorized sequences. 250 of the samples are squences that Orz's code classifier determiens was positive, and 250 that aren't labeled as code.

In [25]:
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from torch import argmax

In [6]:
data_path = "memorized-data"
unique_memories = None

for dataset in os.listdir(data_path):
    print(f"Loading {dataset} from memory")
    model_dataset_path = f"{data_path}/{dataset}"
    hdf_file = pd.read_hdf(f"./memorized-data/{dataset}")

    if unique_memories is None:
        unique_memories = hdf_file
    else:
        unique_memories = pd.concat([unique_memories, hdf_file]).drop_duplicates(["index"])
    
    print(f"Added memorized sequences from {dataset} now totaling {len(unique_memories)} sequences")


Loading 19m.hdf from memory
Added memorized sequences from 19m.hdf now totaling 463953 sequences
Loading 2.7b.hdf from memory
Added memorized sequences from 2.7b.hdf now totaling 1697368 sequences
Loading 6.7b.hdf from memory
Added memorized sequences from 6.7b.hdf now totaling 2308812 sequences
Loading 350m.hdf from memory
Added memorized sequences from 350m.hdf now totaling 2336246 sequences
Loading 13b.hdf from memory
Added memorized sequences from 13b.hdf now totaling 2734597 sequences
Loading 800m.hdf from memory
Added memorized sequences from 800m.hdf now totaling 2758391 sequences
Loading 1.3b.hdf from memory
Added memorized sequences from 1.3b.hdf now totaling 2782412 sequences
Loading 125m.hdf from memory
Added memorized sequences from 125m.hdf now totaling 2788948 sequences


In [13]:
pythia_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
unique_memories["text"] = unique_memories["tokens"].progress_apply(lambda tokens: pythia_tokenizer.decode(tokens))
unique_memories

100%|██████████| 2788948/2788948 [09:48<00:00, 4739.59it/s]


Unnamed: 0,index,accuracy,tokens,text
232,232,1.0,"[996, 186, 29, 1088, 7392, 568, 29860, 5264, 2...","\n\t\t<read echo=""ascii""><delim>\n</delim><mat..."
764,764,1.0,"[599, 17585, 423, 92, 3728, 12945, 423, 92, 33...",}}).\end{array}$$\end{document}$$$$\documentcl...
806,806,1.0,"[313, 39386, 27, 19939, 428, 5270, 310, 1239, ...",(errno: 165 - Table is read only)\nERROR HY00...
891,891,1.0,"[94, 187, 50262, 61, 2099, 92, 8798, 94, 187, ...",}\n \usepackage{amsmath}\n ...
1060,1060,1.0,"[4022, 305, 48095, 4477, 15, 187, 475, 187, 47...",2016 gRPC authors.\n *\n * Licensed under the...
...,...,...,...,...
2120948,146264948,1.0,"[783, 346, 17736, 3287, 2379, 475, 368, 778, 4...","the ""License"");\r\n * you may not use this fil..."
2147503,146291503,1.0,"[50276, 38097, 20386, 390, 20524, 6651, 329, 2...",MERCHANTABILITY or FITNESS FOR A PARTICULAR ...
2209889,146353889,1.0,"[77, 11, 426, 4150, 746, 23759, 1797, 187, 502...",l* = −19→21\n 4492 measured reflections ...
2232160,146376160,1.0,"[15, 187, 475, 187, 475, 25936, 2382, 285, 897...",.\n *\n * Redistribution and use in source and...


In [17]:
unique_memories.to_hdf(f"{data_path}/all_memorized_sequences.hdf", key="memories", mode="w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['tokens', 'text'], dtype='object')]

  unique_memories.to_hdf(f"{data_path}/all_memorized_sequences.hdf", key="memories", mode="w")


In [47]:
def get_code_score(text):
    tokens = code_tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    logits = code_detector(tokens).logits
    pos_score = round(softmax(logits, dim=1)[0][1].item(), 2)
    return pos_score
    

code_detector = AutoModelForSequenceClassification.from_pretrained("usvsnsp/code-vs-nl")
code_tokenizer = AutoTokenizer.from_pretrained("usvsnsp/code-vs-nl")
downsized_memories = unique_memories.sample(10000)
downsized_memories["code_scores"] = downsized_memories["text"].progress_apply(lambda text: get_code_score(text))
downsized_memories = downsized_memories.drop(["accuracy", "tokens"], axis=1)

100%|██████████| 10000/10000 [04:05<00:00, 40.67it/s]


In [50]:
downsized_memories.to_csv(f"{data_path}/downsizes_sequences.csv", mode="w")

In [53]:
positives = downsized_memories[downsized_memories["code_scores"] >= 0.5].sample(250)
positives

Unnamed: 0,index,text,code_scores
2517,82370517,"nr,RhsStorageOrder> pack_rhs;\n\n for(Index...",0.60
262216,90638216,ologies.orient.core.id.ORecordId;\nimport com....,0.57
694713,102510713,4fc53ed82b4e75901d4_5\n- :distance: 335\n :fi...,0.56
46088,49524088,c\n../requests/__pycache__/__version__.cpython...,0.62
908058,28364058,document}$ and $\documentclass[12pt]{minimal}\...,0.60
...,...,...,...
723160,142579160,"7</a></div>\r\n <div><a href=""#"">Item ...",0.60
658902,121922902,56\n\tETHERTYPE_BCLOOP = 0x90...,0.58
29855,128157855,"ribution and use in source and binary forms, w...",0.56
164489,115136489,"bitand_< bitand_< N1,N2 >, N3>\n{\n BOOST_...",0.59


In [54]:
negatives = downsized_memories[downsized_memories["code_scores"] < 0.5].sample(250)
negatives

Unnamed: 0,index,text,code_scores
16032,8024032,comes with a sizable cost. For us to continue...,0.47
545214,29145214,Newton touchdown ball<|endoftext|>J-S28008-18...,0.46
2130233,18146233,no prize?\nThere's no prize? There's no prize...,0.45
187913,7337913,essays on global warming from different angle...,0.45
845337,16861337,Zero\n\nEnjoy reading WOLF STREET and want to...,0.49
...,...,...,...
72367,51838367,"I am now a fan.\n\nWishing you well,\n\nLaura...",0.47
117851,124813851,1560440772390713810515859307960866701724271218...,0.49
1983096,38591096,"DAMAGES\n * (INCLUDING, BUT NOT LIMITED TO, P...",0.49
182970,69394970,"R. 2008,, 681, 626-635\n\nHughes, A. M., Wilne...",0.48


In [57]:
pd.concat([positives, negatives]).sample(frac=1).to_csv(f"{data_path}/evaluation_set_soft_labels.csv", mode="w")