In [1]:
import pandas as pd

# Loading Memorization Evals
> Memorization evals are dataframes (1 per model checkpoint)

> Each dataframe has two columnns: index of data and memorization score of data (accuracy between true continuation and model generation)

In [2]:
import os
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()

# Loading Megatron Dataset

In [3]:
from megatron.data.data_utils import build_the_dataset

In [4]:
config = {
    'deduped': {
        'data_prefix': '/fsx/pile_deduped/pile_0.87_deduped_text_document',
        'idxmap_path': '/fsx/pile_deduped/pile_0.87_deduped_text_document_train_0_indexmap_147164160ns'
    },
    'duped': {
        'data_prefix': '/fsx/pile/pile_20B_tokenizer_text_document',
        'idxmap_path': '/fsx/pile/pile_20B_tokenizer_text_document_test_0_indexmap_10292ns'
    }
}

In [5]:
selected_config = config['duped']

In [6]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Memorization evals are in Pandas DataFrames. We can use Pandarallel, if required to speed up .apply operations

# Loading memorized indicies

In [7]:
models = ['19m', '125m', '350m', '800m', '1.3b', '2.7b', '6.7b', '13b']

In [8]:
duped_memorized_indicies = set()
deduped_memorized_indicies = set()

In [9]:
MEMORIZED_DATA_PATH = '/fsx/orz/memorization-evals/memorized_evals/'
for model in tqdm(models):
    duped_path = MEMORIZED_DATA_PATH + model + '.hdf'
    deduped_path = MEMORIZED_DATA_PATH + model + '-deduped.hdf'
    
    duped_indicies = pd.read_hdf(duped_path, key = 'memorization')['index'].to_numpy().tolist()
    duped_memorized_indicies.update(duped_indicies)
    
    deduped_indicies = pd.read_hdf(deduped_path, key = 'memorization')['index'].to_numpy().tolist()
    deduped_memorized_indicies.update(deduped_indicies)

  0%|          | 0/8 [00:00<?, ?it/s]

# Creating random sample

In [10]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [11]:
from numpy.random import default_rng

rng = default_rng()

In [12]:
random_sampled_indicies = pd.DataFrame()
choices = rng.choice(1024*143000, size=int(5e6), replace=False)
choices.sort()
random_sampled_indicies['index'] = choices

# Initializing Megatron Dataset

In [13]:
from functools import lru_cache

def initialize_dataset():
    dataset = build_the_dataset(
        data_prefix = selected_config['data_prefix'], # Replace with the path of pile document
        name = 'train_0',
        data_impl='mmap',
        num_samples=131727360,
        seq_length=2048,
        seed=1234,
        skip_warmup=True,
        build_index_mappings=False
    )
    idx_path = selected_config['idxmap_path']
    dataset.doc_idx = np.load(f"{idx_path}_2048sl_1234s_doc_idx.npy")
    dataset.sample_idx = np.load(f"{idx_path}_2048sl_1234s_sample_idx.npy")
    dataset.shuffle_idx = np.load(f"{idx_path}_2048sl_1234s_shuffle_idx.npy")
    
    dataset.shuffle_idx_len = dataset.shuffle_idx.shape[0] - 1
    dataset.sample_idx_len = dataset.sample_idx.shape[0] - 1
    
    return dataset

In [14]:
ds = initialize_dataset()

    reading sizes...
    reading pointers...
    reading document index...
    creating numpy buffer of mmap...
    creating memory view of numpy buffer...
    train_0:
     no. of documents:210604984


In [15]:
required_docs = {}

In [16]:
def set_doc_idx(idx):
    start_doc_idx, start_offset = ds.sample_idx[ds.shuffle_idx[idx]]
    end_doc_idx, end_offset = ds.sample_idx[ds.shuffle_idx[idx] + 1]
    if start_doc_idx == end_doc_idx:
        required_docs[(ds.doc_idx[start_doc_idx].item(), start_offset.item(), end_offset.item())] = []
    
    else:
        doc_idxes = ds.doc_idx[start_doc_idx:end_doc_idx + 1]
        for i in range(len(doc_idxes)):
            if i == 0:
                required_docs[(doc_idxes[i].item(), start_offset.item(), None)] = []
            
            elif i == (len(doc_idxes) - 1):
                required_docs[(doc_idxes[i].item(), None, end_offset.item())] = []
            
            else:
                required_docs[(doc_idxes[i].item(), None, None)] = []

In [17]:
random_sampled_indicies['index'].progress_apply(set_doc_idx)

  0%|          | 0/5000000 [00:00<?, ?it/s]

0          None
1          None
2          None
3          None
4          None
           ... 
4999995    None
4999996    None
4999997    None
4999998    None
4999999    None
Name: index, Length: 5000000, dtype: object

In [18]:
required_doc_indicies = list(required_docs.keys())

In [19]:
required_doc_indicies.sort(key = lambda x: x[0])

In [20]:
random_sampled_indicies.head()

Unnamed: 0,index
0,12
1,30
2,75
3,81
4,85


In [21]:
for i in tqdm(range(0, len(required_doc_indicies), 65536)):
    samples = required_doc_indicies[i:i + 65536]
    
    docs = ds.indexed_dataset[samples[0][0]: samples[-1][0] + 10]
    for sample in samples:
        tokens = docs[sample[0] - samples[0][0]][sample[1]:sample[2]]
        required_docs[sample] = tokens

  0%|          | 0/176 [00:00<?, ?it/s]

In [22]:
def get_tokens(idx):
    start_doc_idx, start_offset = ds.sample_idx[ds.shuffle_idx[idx]]
    end_doc_idx, end_offset = ds.sample_idx[ds.shuffle_idx[idx] + 1]
    if start_doc_idx == end_doc_idx:
        return required_docs[(ds.doc_idx[start_doc_idx].item(), start_offset.item(), end_offset.item())]
    
    else:
        doc_idxes = ds.doc_idx[start_doc_idx:end_doc_idx + 1]
        
        all_tokens = []
        for i in range(len(doc_idxes)):
            if i == 0:
                all_tokens.append(required_docs[(doc_idxes[i].item(), start_offset.item(), None)])
            
            elif i == (len(doc_idxes) - 1):
                all_tokens.append(required_docs[(doc_idxes[i].item(), None, end_offset.item())])
            
            else:
                all_tokens.append(required_docs[(doc_idxes[i].item(), None, None)])
        
        return np.concatenate(all_tokens)

In [23]:
random_sampled_indicies['tokens'] = random_sampled_indicies['index'].parallel_apply(get_tokens)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=104167), Label(value='0 / 104167')…

In [24]:
random_sampled_indicies.set_index('index', inplace = True)

In [25]:
random_sampled_indicies.head()

Unnamed: 0_level_0,tokens
index,Unnamed: 1_level_1
12,"[7742, 13, 253, 2317, 3798, 310, 6898, 387, 18..."
30,"[5962, 1318, 275, 465, 13, 337, 13, 259, 32, 1..."
75,"[4122, 14900, 2074, 8898, 9532, 15, 2053, 2590..."
81,"[13, 390, 310, 1512, 1711, 281, 320, 7607, 285..."
85,"[2014, 577, 89, 21, 313, 1237, 14, 6443, 10, 1..."


In [26]:
random_sampled_indicies['is_memorized'] = random_sampled_indicies.index.map(lambda x: x in duped_memorized_indicies)

In [27]:
random_sampled_indicies[random_sampled_indicies['is_memorized'] == True]

Unnamed: 0_level_0,tokens,is_memorized
index,Unnamed: 1_level_1,Unnamed: 2_level_1
464,"[540, 15945, 13, 2657, 50272, 2855, 13361, 89,...",True
1412,"[89, 4196, 69, 22, 317, 20, 1952, 13, 470, 89,...",True
6111,"[17736, 3287, 187, 5658, 778, 417, 897, 436, 1...",True
9310,"[17101, 5, 12780, 7955, 3140, 3, 187, 50276, 8...",True
9715,"[89, 24614, 1671, 1252, 187, 186, 12775, 64, 4...",True
...,...,...
146426042,"[1540, 15, 25206, 393, 187, 50274, 10494, 2085...",True
146426866,"[395, 247, 4869, 43923, 3885, 273, 964, 2091, ...",True
146426995,"[7, 4315, 50, 13, 50276, 10975, 11897, 54, 558...",True
146429525,"[15, 11732, 9, 70, 9679, 10131, 9, 18939, 15, ...",True


In [28]:
from datasets import Dataset

In [29]:
ds = Dataset.from_pandas(random_sampled_indicies)

In [30]:
ds

Dataset({
    features: ['tokens', 'is_memorized', 'index'],
    num_rows: 5000000
})

In [31]:
ds.push_to_hub("EleutherAI/pile-duped-pythia-random-sampled", max_shard_size = "10GB")

Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/421 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [7]:
tokenizer = AutoTokenizer.from_pretrained("ELeutherAI/pythia-70m")

Downloading:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
tokenizer.decode(dataset[1]['text'])

' a video taken in Textra was restricted to 1 second only.\n\nFixed issue on Samsung S9 phones where in the + panel, the bottom toolbar would (on occasions) not appear.\n\nFixed issue on Samsung S9 phones where in the + panel, the full screen camera icons would (on occasions) not appear.\n\nFixed issue on Samsung S9 phones where in the + panel, long pressing the camera icon to launch the camera app, would then not return the image taken (it returned a blank square).\n\nFixed issue on some phones where conversation bubbles and message text would sometimes \'run off\' the screen, mostly this impacted HTC U11 phones.\n\nGDPR privacy compliance works.\n\nBug fixes and stability improvements.\n\nVersion 3.44 NEW FEATURE: Complete flexibility in configuring Notification Buttons as well as Notification Gestures (Tap and Clear). See Settings > Customize Notifications > Define Notification Actions. These are also configurable per conversation as well (long press a conversation, dotdotdot, Custo

# Validation

In [37]:
from transformers import AutoModelForCausalLM

In [38]:
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-19m")

In [23]:
nmd = memorization_results['19m-143000']

In [25]:
nmd[nmd['accuracy'] == 1]

Unnamed: 0,index,accuracy
232,232,1.0
764,764,1.0
806,806,1.0
891,891,1.0
1060,1060,1.0
...,...,...
2287433,146431433,1.0
2287569,146431569,1.0
2287580,146431580,1.0
2287652,146431652,1.0


In [28]:
input_tokens = dataset[232]['text'][:64]

In [30]:
import torch

In [42]:
model.generate(
    input_ids = torch.tensor(input_tokens[:32].reshape(1, -1)),
    temperature = 0.0,
    top_k = 0,
    top_p = 0,
    min_length = 64,
    max_length = 64
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


tensor([[  996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694, 13544,
            79,   870, 21694,  2730,  8992,  2730,  2203,    31,    18,    10,
         23934, 17775,    61,    79,   870,  2203,  3073,  8992,  3073,  1088,
            31,   996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694,
         13544,    79,   870, 21694,  2730,  8992,  2730,  2203,    31,    19,
            10,  6707, 17775,    61,    79,   870,  2203,  3073,  8992,  3073,
          1088,    31,   996,   186]])

In [44]:
input_tokens

array([  996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694,
       13544,    79,   870, 21694,  2730,  8992,  2730,  2203,    31,
          18,    10, 23934, 17775,    61,    79,   870,  2203,  3073,
        8992,  3073,  1088,    31,   996,   186,    29,  1088,  7392,
         568, 29860,  5264, 21694, 13544,    79,   870, 21694,  2730,
        8992,  2730,  2203,    31,    19,    10,  6707, 17775,    61,
          79,   870,  2203,  3073,  8992,  3073,  1088,    31,   996,
         186])

In [3]:
!aws s3 ls s3://s-eai-neox/pythia/


An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied
