# 📚 FineWeb-Edu-score-2


## 5.4 trillion tokens of the finest educational data the 🌐 web has to offer


Let's get started

In [1]:
import sys
import os


# Make the project's helper modules under scripts/lib importable. The path is
# resolved relative to the current working directory, two levels up.
dirname = os.path.abspath(os.path.join(os.getcwd(), "../..", "scripts/lib"))
# Guard the append so re-running this cell does not pile up duplicate entries.
if dirname not in sys.path:
    sys.path.append(dirname)

In [2]:
from datasets import load_dataset
from utils.tokenizer import BPETokenizer, showTokensDecode

In [3]:
# Open the CC-MAIN-2024-10 configuration of the dataset in streaming mode.
fw = load_dataset(
    "HuggingFaceFW/fineweb-edu-score-2",
    name="CC-MAIN-2024-10",
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/6625 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

In [4]:
# Display the dataset's repr (feature names and shard count).
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    n_shards: 76
})

In [5]:
# Take 30 examples from the stream. `range` comes first in the zip, so the
# stream is not asked for an extra example once the range is exhausted.
data = [chunk for _, chunk in zip(range(30), fw)]

In [6]:
# Inspect the stream's position after the read above.
fw.state_dict()

{'shard_idx': 0, 'shard_example_idx': 30}

## We want to train the tokenizer on this data

Let's load a much larger sample: 10,000 documents from the stream.

In [7]:
# Collect 10000 examples from the stream to serve as the tokenizer's training corpus.
data = [chunk for _, chunk in zip(range(10000), fw)]

# Total number of characters across all collected documents.
sum(len(chunk['text']) for chunk in data)

41956906

We now have $\sim 42$ M characters.
We will aim for a vocabulary size of 1024: 1023 regular tokens plus the `<|endoftext|>` special token.

In [8]:
# Start from a fresh tokenizer with the end-of-text special token mapped to id 1023
# (presumably the last slot of the 1024-entry vocabulary -- confirm in BPETokenizer).
tokenizer = BPETokenizer(special_tokens={'<|endoftext|>': 1023})

In [9]:
# Concatenate every document into one training string, separated by single spaces.
textData = " ".join(chunk['text'] for chunk in data)
len(textData)

41966905

In [10]:
# Reuse a previously saved tokenizer when one exists on disk; otherwise keep
# the fresh tokenizer created earlier.
saved_tokenizer = '../../saves/tokenizers/fineweb-edu-1024.tok'
if os.path.exists(saved_tokenizer):
    tokenizer = BPETokenizer.load(saved_tokenizer)
    print(len(tokenizer))

1024


In [11]:
# Grow the tokenizer in small batches until it reaches the target size, saving
# after every batch so an interrupted run keeps its progress on disk.
desired_length = 1024
while len(tokenizer) < desired_length:
    # Request at most 20 merges per pass, and never more than are still missing.
    tokenizer.addMerges(textData, min(desired_length - len(tokenizer), 20), verbose=True)
    # desired_length is an int, so it must be formatted into the path:
    # concatenating it to a str with `+` raises TypeError.
    tokenizer.save(f'../../saves/tokenizers/fineweb-edu-{desired_length}.tok')

In [12]:
# Reload the tokenizer from the saved file and display its size.
tokenizer = BPETokenizer.load('../../saves/tokenizers/fineweb-edu-1024.tok')
len(tokenizer)

1024

In [13]:
# Visualise how the tokenizer splits a short hand-written sentence.
showTokensDecode(tokenizer, "Hello there what do you want? Oh that's not bad to be honest")

[1m[30m[42mH[0m[1m[30m[43mell[0m[1m[30m[44mo[0m[1m[30m[45m there[0m[1m[30m[46m what[0m[1m[100m do[0m[1m[40m you[0m[1m[30m[42m want[0m[1m[30m[43m?[0m[1m[30m[44m O[0m[1m[30m[45mh[0m[1m[30m[46m that[0m[1m[100m's[0m[1m[40m not[0m[1m[30m[42m b[0m[1m[30m[43mad[0m[1m[30m[44m to[0m[1m[30m[45m be[0m[1m[30m[46m h[0m[1m[100mon[0m[1m[40mest[0m[0m


In [14]:
# Same visualisation on the first 300 characters of the first collected document.
showTokensDecode(tokenizer, data[0]['text'][:300])

[1m[30m[42mH[0m[1m[30m[43mow[0m[1m[30m[44mever[0m[1m[30m[45m it[0m[1m[30m[46m is[0m[1m[100m not[0m[1m[40m sim[0m[1m[30m[42mp[0m[1m[30m[43mly[0m[1m[30m[44m those[0m[1m[30m[45m who[0m[1m[30m[46m are[0m[1m[100m tra[0m[1m[40md[0m[1m[30m[42mit[0m[1m[30m[43mions[0m[1m[30m[44m a[0m[1m[30m[45mw[0m[1m[30m[46may[0m[1m[100m opt[0m[1m[40mion[0m[1m[30m[42m pl[0m[1m[30m[43mans[0m[1m[30m[44m to[0m[1m[30m[45m help[0m[1m[30m[46m you[0m[1m[100m w[0m[1m[40med[0m[1m[30m[42md[0m[1m[30m[43ming[0m[1m[30m[44m just[0m[1m[30m[45m who[0m[1m[30m[46m dec[0m[1m[100ml[0m[1m[40mare[0m[1m[30m[42m that[0m[1m[30m[43m the[0m[1m[30m[44m col[0m[1m[30m[45mle[0m[1m[30m[46mge[0m[1m[100m has[0m[1m[40m start[0m[1m[30m[42med[0m[1m[30m[43m to[0m[1m[30m[44m bec[0m[1m[30m[45mome[0m[1m[30m[46m out[0m[1m[100md[0m[1m[40mated[0m[1m[30m[42m.[0m[1m[30m

In [15]:
# Compression rate: characters in the corpus (a) divided by the number of
# items returned by tokenizer.encode on that corpus (b).
a, b = len(textData), len(tokenizer.encode(textData))
print(f"The compression rate is x{a/b:.2f}")

The compression rate is x2.57


<span style="font-size:2.5em;">
    To preprocess the data, execute 'make cook'
</span>