# Overview

We will download and tokenize the data and save data shards to cloud storage.

In [1]:
import os
import multiprocessing as mp
import numpy as np

import tiktoken

from datasets import load_dataset
from tqdm import tqdm


local_dir='edu_fineweb10B'
remote_name='sample-10BT'
shard_size=int(1e8) # 100M tokens per shard, total of 100 shards

# create the cache the local directory if it doesn't exist yet
DATA_CACHE_DIR = os.path.join('.', local_dir)

In [2]:
os.makedirs(DATA_CACHE_DIR, exist_ok=True) # edu_fineweb10B

# Load the dataset

In [3]:
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")
ds

README.md:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

001_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

002_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

003_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

004_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

005_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

006_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

007_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

008_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

009_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

010_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

011_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

012_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

013_00000.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9672101 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_rows: 9672101
})

# Init the tokenizer

In [4]:
enc=tiktoken.get_encoding('gpt2')
eot=enc._special_tokens['<|endoftext|>'] # end of text token

In [5]:
def tokenize(doc):
    # tokenizes a single document and returns a numpy array of unit16 tokens
    tokens=[eot] # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc['text']))
    tokens_np=np.array(tokens)
    assert(0<=tokens_np).all() and (tokens_np < 2**16).all(), 'token dictionary too large for uint16'
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16

def write_datafile(filename, tokens_np):
    np.save(filename, tokens_np)

# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
nprocs=max(1, os.cpu_count()//2)

with mp.Pool(nprocs) as pool:
    shard_index=0
    # preallocate buffer to hold current shard
    all_tokens_np=np.empty((shard_size,), dtype=np.uint16)
    token_count=0
    progress_bar=None
    
    for tokens in pool.imap(tokenize, ds, chunksize=16):
        if token_count+len(tokens)<shard_size:
            # simply append tokens to current shard
            all_tokens_np[token_count:token_count+len(tokens)]=tokens
            token_count+=len(tokens)
            # update progress bar
            if progress_bar is None:
                progress_bar = tqdm(total=shard_size, unit='tokens', desc=f'Shard {shard_index}')
            progress_bar.update(len(tokens))
        else:
            # write the current shard and start a new one
            split = "val" if shard_index == 0 else "train"
            filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
            # split the document into whatever fits in this shard; the remainder goes to next one
            remainder = shard_size - token_count
            progress_bar.update(remainder)
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            write_datafile(filename, all_tokens_np)
            shard_index += 1
            progress_bar = None
            # populate the next shard with the leftovers of the current doc
            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
            token_count = len(tokens)-remainder

    # write any remaining tokens as the last shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = os.path.join(DATA_CACHE_DIR, f"edufineweb_{split}_{shard_index:06d}")
        write_datafile(filename, all_tokens_np[:token_count])

Shard 0: 100%|██████████| 100000000/100000000 [00:48<00:00, 2075473.54tokens/s]
Shard 1: 100%|█████████▉| 99997871/100000000 [00:48<00:00, 2046823.23tokens/s]
Shard 2: 100%|█████████▉| 99999982/100000000 [00:48<00:00, 2041382.26tokens/s]
Shard 3: 100%|█████████▉| 99999934/100000000 [00:48<00:00, 2056323.34tokens/s]
Shard 4: 100%|█████████▉| 99999796/100000000 [00:49<00:00, 2030387.29tokens/s]
Shard 5: 100%|█████████▉| 99998855/100000000 [00:48<00:00, 2082949.67tokens/s]
Shard 6: 100%|█████████▉| 99997594/100000000 [01:05<00:00, 1531595.64tokens/s]
Shard 7: 100%|█████████▉| 99996910/100000000 [00:49<00:00, 2017739.46tokens/s]
Shard 8: 100%|█████████▉| 99999293/100000000 [00:48<00:00, 2052942.40tokens/s]
Shard 9: 100%|█████████▉| 99997859/100000000 [00:48<00:00, 2052057.94tokens/s]
Shard 10: 100%|█████████▉| 99999807/100000000 [00:48<00:00, 2063911.06tokens/s]
Shard 11: 100%|█████████▉| 99999526/100000000 [00:48<00:00, 2060559.92tokens/s]
Shard 12: 100%|█████████▉| 99996983/100000000 [00

# Acknowledgements

* https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
* https://www.youtube.com/watch?v=l8pRSuU81PU