In [1]:
import pandas as pd

# Loading Memorization Evals
> Memorization evals are dataframes (1 per model checkpoint)

> Each dataframe has two columns: the index of the data sample and its memorization score (the accuracy between the true continuation and the model's generation)

In [9]:
# Model sizes covered by the memorization evals, listed largest first.
models = [
    '13b', '6.7b', '2.7b', '1.3b',
    '800m', '350m', '125m', '19m',
]
# Evaluated checkpoints: 23000 through 143000 in increments of 20000.
checkpoints = list(range(23000, 143001, 20000))
# Short labels, one per checkpoint (presumably paired positionally with `checkpoints`).
checkpoint_names = ['23m', '44m', '65m', '85m', '105m', '126m', '146m']
# Directory that holds the per-model, per-checkpoint result files.
filepath = '/mnt/ssd-1/memorization-evals'

In [10]:
import os
import pandas as pd
from tqdm.auto import tqdm

# Hook tqdm into pandas so the progress_* variants (e.g. .progress_apply) become available.
tqdm.pandas()

In [13]:
# Load one memorization-eval dataframe per (model, checkpoint) pair, keyed as
# '<model>-<checkpoint>'. The .hdf file acts as a cache of the original CSV: it is
# read when available and created from the CSV otherwise.
memorization_results = {}
for model in models:
    for checkpoint in tqdm(checkpoints, desc=model):
        filename = os.path.join(filepath, f'memorization_{model}_{checkpoint}.hdf')
        model_name = f'{model}-{checkpoint}'
        try:
            memorization_results[model_name] = pd.read_hdf(filename, key='memorization')
        except (FileNotFoundError, KeyError):
            # Only a missing cache file or a missing 'memorization' key means "not
            # cached yet". Any other failure (corrupt file, missing PyTables, ...) is
            # allowed to surface instead of being hidden behind the CSV fallback.
            csv = pd.read_csv(os.path.join(filepath, f'memorization_results_{model}_{checkpoint}.csv'))
            csv.to_hdf(filename, key='memorization', index=False)
            memorization_results[model_name] = csv

13b:   0%|          | 0/7 [00:00<?, ?it/s]

6.7b:   0%|          | 0/7 [00:00<?, ?it/s]

2.7b:   0%|          | 0/7 [00:00<?, ?it/s]

1.3b:   0%|          | 0/7 [00:00<?, ?it/s]

800m:   0%|          | 0/7 [00:00<?, ?it/s]

350m:   0%|          | 0/7 [00:00<?, ?it/s]

125m:   0%|          | 0/7 [00:00<?, ?it/s]

19m:   0%|          | 0/7 [00:00<?, ?it/s]

# Loading Megatron Dataset

In [14]:
from megatron.data.data_utils import build_the_dataset

In [15]:
# Arguments for the Megatron dataset built over the tokenized Pile. Index mappings
# are not built here (build_index_mappings=False).
dataset_kwargs = dict(
    data_prefix='/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document',  # Replace with the path of pile document
    name='train_0',
    data_impl='mmap',
    num_samples=131727360,
    seq_length=2048,
    seed=1234,
    skip_warmup=True,
    build_index_mappings=False,
)
dataset = build_the_dataset(**dataset_kwargs)

    reading sizes...
    reading pointers...
    reading document index...
    creating numpy buffer of mmap...
    creating memory view of numpy buffer...
    train_0:
     no. of documents:210604984


In [16]:
import numpy as np

In [17]:
# Attach the saved document / sample / shuffle index arrays to the dataset, in that order.
idx_path = '/mnt/ssd-1/data/pile_20B_tokenizer/pile_20B_tokenizer_text_document_train_0_indexmap_143213ns'
index_files = {
    'doc_idx': f"{idx_path}_2048sl_1234s_doc_idx.npy",
    'sample_idx': f"{idx_path}_2048sl_1234s_sample_idx.npy",
    'shuffle_idx': f"{idx_path}_2048sl_1234s_shuffle_idx.npy",
}
for attr_name, npy_path in index_files.items():
    setattr(dataset, attr_name, np.load(npy_path))

In [18]:
# Record, for each index array, its number of entries minus one.
dataset.shuffle_idx_len = len(dataset.shuffle_idx) - 1
dataset.sample_idx_len = len(dataset.sample_idx) - 1

Memorization evals are stored in Pandas DataFrames. If required, we can use Pandarallel to speed up `.apply` operations.

In [6]:
!pip3 install pandarallel

Defaulting to user installation because normal site-packages is not writeable
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... done
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16665 sha256=586501ca3dd38266e004c89363e4c2c95b7128b5aea0d283dd4314b92f140539
  Stored in directory: /home/mchorse/.cache/pip/wheels/42/4f/2f/6ef59875d9ffe4c49cdf326407a77a151336314344c4044540
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.4


In [7]:
from pandarallel import pandarallel
# Initialise pandarallel with progress bars turned on; afterwards the parallel_*
# variants of pandas operations (e.g. .parallel_apply) can be used.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 36 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Validation

In [37]:
from transformers import AutoModelForCausalLM

In [38]:
# Load the "EleutherAI/pythia-19m" causal LM to spot-check the stored memorization scores.
# No revision is given, so the repo's default revision is used — presumably the final
# checkpoint, matching the '19m-143000' results inspected below (TODO confirm).
# Note: this rebinds `model`, which the results-loading loop used for model-size strings.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-19m")

In [23]:
# Memorization results for the '19m' model at checkpoint 143000 (the last entry of `checkpoints`).
nmd = memorization_results['19m-143000']

In [25]:
# Show only the rows whose memorization score is exactly 1.
nmd.loc[nmd['accuracy'].eq(1)]

Unnamed: 0,index,accuracy
232,232,1.0
764,764,1.0
806,806,1.0
891,891,1.0
1060,1060,1.0
...,...,...
2287433,146431433,1.0
2287569,146431569,1.0
2287580,146431580,1.0
2287652,146431652,1.0


In [28]:
# Sample 232 is the first row with accuracy 1.0 in the table above; keep its first 64 tokens.
input_tokens = dataset[232]['text'][:64]

In [30]:
import torch

In [42]:
# Greedily continue the first 32 tokens of the sample out to exactly 64 tokens, so the
# result can be compared with `input_tokens`.
prompt_ids = torch.tensor(input_tokens[:32].reshape(1, -1))
model.generate(
    input_ids=prompt_ids,
    # The prompt is a single unpadded sequence, so every position is attended to.
    attention_mask=torch.ones_like(prompt_ids),
    # Greedy decoding, stated explicitly. The former temperature/top_k/top_p arguments
    # only apply when sampling, so they had no effect on the result and are dropped.
    do_sample=False,
    min_length=64,
    max_length=64,
    # Same value generate() falls back to; passing it explicitly avoids the warning.
    pad_token_id=model.config.eos_token_id,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


tensor([[  996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694, 13544,
            79,   870, 21694,  2730,  8992,  2730,  2203,    31,    18,    10,
         23934, 17775,    61,    79,   870,  2203,  3073,  8992,  3073,  1088,
            31,   996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694,
         13544,    79,   870, 21694,  2730,  8992,  2730,  2203,    31,    19,
            10,  6707, 17775,    61,    79,   870,  2203,  3073,  8992,  3073,
          1088,    31,   996,   186]])

In [44]:
# Display the sample's true first 64 tokens for comparison with the generated sequence.
input_tokens

array([  996,   186,    29,  1088,  7392,   568, 29860,  5264, 21694,
       13544,    79,   870, 21694,  2730,  8992,  2730,  2203,    31,
          18,    10, 23934, 17775,    61,    79,   870,  2203,  3073,
        8992,  3073,  1088,    31,   996,   186,    29,  1088,  7392,
         568, 29860,  5264, 21694, 13544,    79,   870, 21694,  2730,
        8992,  2730,  2203,    31,    19,    10,  6707, 17775,    61,
          79,   870,  2203,  3073,  8992,  3073,  1088,    31,   996,
         186])