In [29]:
texts = [
    "Hello, how are you?",
    "I am fine, thank you!",
    "Let's create a custom tokenizer.",
    "Tokenization is fun and useful.",
    "This is an example of training a tokenizer from scratch.",
    "Machine learning can be applied in many fields.",
    "Natural language processing involves analyzing text data.",
    "Deep learning models understand complex patterns in data.",
    "Data science is an interdisciplinary field combining statistics, computer science, and domain expertise.",
    "Python is a versatile and powerful programming language.",
    "Jupyter Notebooks are great for interactive and exploratory coding.",
    "Artificial intelligence is transforming various industries.",
    "Open source libraries accelerate innovation in technology.",
    "The quick brown fox jumps over the lazy dog.",
    "Building a tokenizer requires careful text preprocessing and experimentation.",
    "We can add special tokens like <im_user> to highlight specific parts of the text.",
    "Tokenizers enable efficient text representation for language models.",
    "Clean and diverse datasets are crucial for training robust models.",
    "Experimentation and iteration are key to success in machine learning projects.",
    "Always test your models with new and unseen data to ensure generalization.",
    "Custom tokenization can help tailor the model to your specific dataset.",
    "Data augmentation techniques improve the performance of machine learning models.",
    "Learning how to tokenize text is a fundamental skill in natural language processing.",
    "Preprocessing text data helps remove noise and irrelevant characters.",
    "A well-trained tokenizer forms the basis for many language processing tasks."]


In [30]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

tokenizer.decoder = decoders.ByteLevel()

special_tokens = ["<pad>", "<unk>", "<im_user>"]

trainer = trainers.BpeTrainer(vocab_size=1000, special_tokens=special_tokens)

tokenizer.train_from_iterator(texts, trainer=trainer)







In [31]:
sample_text = "This is an unseen new text"
encoding = tokenizer.encode(sample_text)

print("Token IDs:", encoding.ids)
print("Tokens:", encoding.tokens)


Token IDs: [478, 97, 68, 448, 442, 104]
Tokens: ['This', 'Ġis', 'Ġan', 'Ġunseen', 'Ġnew', 'Ġtext']


In [32]:
tokenizer.save("custom_tokenizer.json")

In [28]:
from tokenizers import Tokenizer
loaded_tokenizer = Tokenizer.from_file("custom_tokenizer.json")

loaded_encoding = loaded_tokenizer.encode(sample_text)
print("Loaded tokenizer tokens:", loaded_encoding.tokens)

Loaded tokenizer tokens: ['This', 'Ġis', 'Ġan', 'Ġunseen', 'Ġnew', 'Ġtext']


In [4]:
import json

with open("./llm.c/dev/data/tinystories/TinyStories_all_data/data49.json", "r") as file:
    json_data = json.load(file)

print(json_data)  

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
json_data[0]

{'story': 'Once upon a time, in a small house, there lived a little girl named Sue. Sue had a pretty scarf. She loved her scarf very much. It was soft and gentle. She wore it every day.\nOne day, her friend Tim came to play. Tim saw the scarf and liked it too. He said, "Please, can I wear the scarf too?" Sue was happy to share. She said, "Yes, please be gentle with my scarf."\nThey played all day with the scarf. They ran, they jumped, and they laughed. They were very careful and gentle with the scarf. At the end of the day, Tim gave the scarf back to Sue. He said, "Thank you for sharing your scarf with me."\nSue smiled and said, "You\'re welcome. I\'m glad we could share my scarf and have fun together." They hugged and said goodbye. Sue went inside her house, still wearing her gentle scarf, feeling happy and warm.',
 'instruction': {'prompt:': 'Write a short story (3-5 paragraphs) which only uses very simple words that a 3 year old child would understand. The story should use the verb 

In [11]:
import numpy as np

def read_datafile(filename, model_desc="gpt-2"):
    """
    Reads a binary token file written by `write_datafile()`.
    """
    info = {
        "gpt-2": {"token_dtype": np.uint16},  # 2 bytes per token
        "llama-3": {"token_dtype": np.uint32} # 4 bytes per token
    }
    
    assert model_desc in info, f"Unknown model descriptor {model_desc}"
    
    with open(filename, "rb") as f:
        header = np.frombuffer(f.read(1024), dtype=np.int32)
        
        magic_number = header[0]
        version = header[1]
        num_tokens = header[2]

        print(f"Magic Number: {magic_number}")
        print(f"Version: {version}")
        print(f"Number of Tokens: {num_tokens}")

        token_dtype = info[model_desc]["token_dtype"]
        tokens = np.frombuffer(f.read(num_tokens * np.dtype(token_dtype).itemsize), dtype=token_dtype)
    
    return tokens

tokens = read_datafile("./llm.c/dev/data/tinyshakespeare/tiny_shakespeare_val.bin", model_desc="gpt-2")  # or "llama-3"
print(f"Loaded {len(tokens):,} tokens")
print(tokens[:10]) 


Magic Number: 20240520
Version: 1
Number of Tokens: 32768
Loaded 32,768 tokens
[50256  5962 22307    25   198  8421   356  5120   597  2252]


In [13]:
len(tokens)

32768