# Importing Libraries

In [1]:
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

# Register tqdm's pandas integration so the `progress_map` call on the
# 'Index' column later in the notebook can display a progress bar.
tqdm.pandas()

In [2]:
from pandarallel import pandarallel
# Start pandarallel (with progress bars enabled); this is what makes the
# `parallel_map` call used further down for fetching token arrays available.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Load Asymptotic Counts
> Allows us to load the hash of each sequence

In [3]:
# Per-sequence table (the file name suggests the deduped variant — confirm).
# Later cells read its `Hash` and `Index` columns to group sequences that
# share a hash.
data = pd.read_parquet("results/deduped_counts.parquet")

# Calculate Unique Indices

In [4]:
from collections import defaultdict
# hash -> list of sequence indices that share that hash (filled by the loop below)
groups = defaultdict(list)

# Indices retained as the single representative of each distinct sequence
unique_indicies = []

In [5]:
# Bucket every sequence index under its hash.
# The columns are read by name: `itertuples` exposed the 'Index' column only
# under the positional alias `_1` (the name clashes with the namedtuple's own
# `Index` field), which silently points at a different column if the column
# order ever changes. Zipping two columns is also much cheaper than building
# a namedtuple for each of the ~146M rows.
for seq_hash, seq_index in tqdm(zip(data['Hash'], data['Index']), total = len(data)):
    groups[seq_hash].append(seq_index)

  0%|          | 0/146432000 [00:00<?, ?it/s]

In [6]:
# Hashes whose group holds fewer than two sequences: nothing to compare against,
# so these groups can be taken out of `groups` afterwards.
removable_hashes = [
    candidate_hash
    for candidate_hash in tqdm(groups.keys())
    if len(groups[candidate_hash]) < 2
]

  0%|          | 0/146025205 [00:00<?, ?it/s]

In [7]:
# Remove every singleton group from `groups` and keep its only member as unique.
unique_indicies.extend(
    groups.pop(singleton_hash)[0] for singleton_hash in tqdm(removable_hashes)
)

  0%|          | 0/145841274 [00:00<?, ?it/s]

In [8]:
from mmap_dataset import MMapIndexedDataset

# Memory-mapped dataset of sequences; `get_arr_from_idx` indexes it with a
# sequence index. Presumably holds token ids (the result is stored in a
# 'Tokens' column later) — TODO confirm.
pile_dataset = MMapIndexedDataset('/scratch/pile/deduped/document', skip_warmup = True)

def get_arr_from_idx(idx, length = 64):
    """Return the leading `length` entries of sequence `idx` from `pile_dataset`.

    Args:
        idx: Position of the sequence inside `pile_dataset`.
        length: Number of leading entries to keep. Defaults to 64, the prefix
            size used by every call in this notebook.

    Returns:
        The slice `pile_dataset[idx][:length]`.
    """
    return pile_dataset[idx][:length]

    reading sizes...
    reading pointers...
    reading document index...
    creating numpy buffer of mmap...
    creating memory view of numpy buffer...


In [9]:
# For every hash shared by several sequences, keep one representative per
# distinct content and record the rest as duplicates.
for indicies in tqdm(groups.values()):
    # Members of this group already matched to an earlier representative.
    duplicate_indicies = set()
    for i, candidate in enumerate(indicies):
        if candidate in duplicate_indicies:
            continue

        # First occurrence of this content within the group: keep it.
        unique_indicies.append(candidate)

        later_members = indicies[i + 1:]
        if not later_members:
            # Nothing left to compare against; skip the dataset read.
            continue

        # Read the representative once per candidate, not once per comparison.
        candidate_arr = get_arr_from_idx(candidate)
        for other in later_members:
            if other in duplicate_indicies:
                # Already known to be a duplicate; re-reading it changes nothing.
                continue
            # `array_equal` returns False when the two arrays differ in shape,
            # where an elementwise `==` followed by `.all()` would fail.
            if np.array_equal(candidate_arr, get_arr_from_idx(other)):
                duplicate_indicies.add(other)
        

  0%|          | 0/183931 [00:00<?, ?it/s]

In [10]:
import pickle
# Persist the list of unique indices so it can be reloaded later without
# repeating the grouping and comparison cells above.
with open('results/deduped_unique_indicies.pkl', 'wb') as f:
    pickle.dump(unique_indicies, f)

# Load Unique Indices

In [11]:
import pickle
# NOTE(review): this reads the `standard_` pickle, whereas the preceding cell
# writes the `deduped_` one and the inputs loaded earlier are also `deduped`.
# On a fresh Run All the list computed above is replaced by this file's
# contents — confirm which variant is intended.
# The loaded list is turned into a set so the membership test used to filter
# `data` is a hash lookup.
# Only unpickle files produced by this pipeline: pickle can run code on load.
with open('results/standard_unique_indicies.pkl', 'rb') as f:
    unique_indicies = set(pickle.load(f))

# Load Model Memorization Scores

In [12]:
# Model-size tags: each one selects an eval file and becomes the name of a
# score column added to `data`.
models = ['70M', '160M', '410M', '1B', '1.4B', '2.8B', '6.9B', '12B']

In [13]:
# Attach each model's 'accuracy' values to `data` as a column named after the model.
for model in tqdm(models):
    eval_path = f"/fsx/orz/memorization-evals/evals_32/memorization_{model}_143000.hdf"
    accuracy = pd.read_hdf(eval_path, key = 'memorization')['accuracy']
    # Assigned by position: assumes the eval rows are in the same order as
    # the rows of `data` — TODO confirm.
    data[model] = accuracy.to_numpy()

  0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
# Keep only the rows whose sequence index was selected as unique.
# `isin` runs the membership test in vectorised code instead of invoking a
# Python lambda once per row (~146M calls); it accepts a set or any list-like,
# so it also works if `unique_indicies` is still a plain list.
data = data[data['Index'].isin(unique_indicies)]

  0%|          | 0/146432000 [00:00<?, ?it/s]

# Select a representative, small sample

In [15]:
# Draw the 5M-row subset with a fixed seed so that re-running the notebook
# reproduces the same sample (and hence the same dataset pushed to the hub).
data = data.sample(5_000_000, random_state = 42)

In [16]:
# Look up each sampled row's 64-entry prefix in `pile_dataset`, spread across
# the pandarallel workers, and store it in a new 'Tokens' column.
data['Tokens'] = data['Index'].parallel_map(get_arr_from_idx)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=104167), Label(value='0 / 104167')…

In [17]:
from datasets import Dataset

In [18]:
# Convert the sampled frame to a Hugging Face Dataset. The pandas index is
# carried over as `__index_level_0__` (visible in the repr shown next) and is
# removed a few cells later.
dataset = Dataset.from_pandas(data)

In [20]:
# Inspect the feature list and row count before trimming columns.
dataset

Dataset({
    features: ['Index', 'Offset', 'Hash', 'count', '70M', '160M', '410M', '1B', '1.4B', '2.8B', '6.9B', '12B', 'Tokens', '__index_level_0__'],
    num_rows: 5000000
})

In [21]:
# Drop the carried-over pandas index and the hash/count bookkeeping columns,
# leaving the sequence index, the per-model scores and the tokens.
dataset = dataset.remove_columns(['__index_level_0__', 'Offset', 'Hash', 'count'])

In [22]:
# Confirm the remaining features before publishing.
dataset

Dataset({
    features: ['Index', '70M', '160M', '410M', '1B', '1.4B', '2.8B', '6.9B', '12B', 'Tokens'],
    num_rows: 5000000
})

In [23]:
# Publish the sampled dataset to the Hugging Face Hub.
# NOTE(review): the target repo is named "pile-duped", but the parquet and
# document paths loaded at the top of this notebook are the `deduped` ones —
# confirm the variant before pushing.
dataset.push_to_hub("EleutherAI/pile-duped-pythia-random-sampled", max_shard_size = '5GB')

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5000 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/3 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.
