In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import tokenizers

In [3]:
from tokenizers import RegexTokenizer
# Notebook-level tokenizer instance; the dataset tokenization cell further down
# calls `tokenizer.encode` on it.
# NOTE(review): no visible cell trains or loads this instance before that use —
# confirm that encoding with a freshly constructed tokenizer is intended.
tokenizer = RegexTokenizer()

In [4]:
def test_tokenizer_equivalence(text, dataset, vocab_size, chunk_size=100000, verbose=False):
    # Initialize two tokenizers with the same configuration
    tokenizer1 = RegexTokenizer()
    tokenizer2 = RegexTokenizer()

    # Train using the `train` method (single text input)
    tokenizer1.train(text, vocab_size, verbose)

    # Train using the `train_dataset` method (dataset input in chunks)
    tokenizer2.train_dataset(dataset, vocab_size, chunk_size, verbose)

    # Compare merges
    merges_same = tokenizer1.merges == tokenizer2.merges
    print("Merges Match:", merges_same)

    # Compare vocabularies
    vocab_same = tokenizer1.vocab == tokenizer2.vocab
    print("Vocabularies Match:", vocab_same)

    # Print test result
    if merges_same and vocab_same:
        print("Test Passed: Both methods produce identical results.")
    else:
        print("Test Failed: Differences found between methods.")


In [12]:
# Smoke test on a toy corpus: the same sentence twice, once as a single string
# and once as two separate records.
# The recorded progress bars for this cell show 24 training steps each.
vocab_size = 280

# Single-string form consumed by `train`
text = "Sample text data for tokenizer testing. Sample text data for tokenizer testing."

# Record form consumed by `train_dataset`: a list of dictionaries with a 'text' field
dataset = [dict(text="Sample text data for tokenizer testing.") for _ in range(2)]

test_tokenizer_equivalence(
    text=text,
    dataset=dataset,
    vocab_size=vocab_size,
    chunk_size=1,
    verbose=False,
)


100%|██████████| 24/24 [00:00<00:00, 25439.30it/s]


Data chunk is ---> Sample text data for tokenizer testing. <------
Data chunk is ---> Sample text data for tokenizer testing. <------


100%|██████████| 24/24 [00:00<00:00, 6929.39it/s]

Merges Match: True
Vocabularies Match: True
Test Passed: Both methods produce identical results.





In [13]:
file_path = 'data/taylorswift.txt'

# Load the file content. The encoding is pinned so the read does not depend on
# the platform's default locale encoding (assumes the corpus file is UTF-8).
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

# Set up a larger dataset by simulating multiple samples from the file content:
# one record per non-blank line.
# NOTE(review): `splitlines()` drops the line breaks and the filter drops blank
# lines, so the records are not byte-identical to `file_content` — confirm the
# two training paths are still expected to produce the same merges.
dataset = [{'text': line} for line in file_content.splitlines() if line.strip()]

# Set vocabulary size and chunk size
# NOTE(review): the recorded run of this cell was still at 62% after roughly ten
# minutes; consider a smaller vocab_size when iterating.
vocab_size = 10000  # Adjust vocab size as needed for larger tests
chunk_size = 5     # Smaller chunk size for more granular testing

# Run the test function
test_tokenizer_equivalence(file_content, dataset, vocab_size, chunk_size, verbose=False)



 62%|██████▏   | 6016/9744 [09:52<06:29,  9.58it/s]

In [None]:
# Special-token string -> token id.
# NOTE(review): these ids look like the GPT-4 (cl100k_base) special-token ids —
# confirm. A later cell rebinds `special_tokens` to a role -> string mapping.
special_tokens = dict([
    ('<|endoftext|>', 100257),
    ('<|fim_prefix|>', 100258),
    ('<|fim_middle|>', 100259),
    ('<|fim_suffix|>', 100260),
    ('<|endofprompt|>', 100276),
])

In [None]:
#tokenizer.encode("hello world") # string -> tokens
#tokenizer.decode([1000, 2000, 3000]) # tokens -> string
#tokenizer.save("tok32k") # writes tok32k.model and tok32k.vocab
#tokenizer.load("tok32k.model") # loads the model back from disk

In [None]:
## Tokenize dataset and save for further processing

In [None]:
from datasets import load_dataset

# Load the training split of the dataset.
# "your_dataset_name" is a placeholder and has to be replaced with a real dataset id.
dataset = load_dataset(path="your_dataset_name", split="train")

# Define special tokens: role -> token string, and token string -> token id.
# Every key of `special_tokens_ids` must be one of the token strings in
# `special_tokens` (the unk entry was previously keyed by the role name "unk").
special_tokens = {"bos": "<|bos|>", "eos": "<|endoftext|>", "pad": "<|pad|>", "unk": "<|unk|>"}
special_tokens_ids = {"<|bos|>": 10001, "<|endoftext|>": 10002, "<|pad|>": 10003, "<|unk|>": 10004}

# Preprocess the text to add special tokens
def add_special_tokens(example):
    """Wrap a record's text in the bos/eos marker strings.

    The record's "text" entry is overwritten in place with
    "<bos> <text> <eos>" (single spaces as separators), using the 'bos' and
    'eos' entries of the notebook-level `special_tokens` mapping. The same
    record object is returned.
    """
    bos_marker = special_tokens['bos']
    eos_marker = special_tokens['eos']
    example["text"] = "{} {} {}".format(bos_marker, example['text'], eos_marker)
    return example

# Run `add_special_tokens` over every record; `dataset` is rebound to the mapped result.
dataset = dataset.map(function=add_special_tokens)


# Tokenize the dataset and save tokenized ids
def tokenize_function(example):
    """Encode the "text" field with the notebook-level `tokenizer`.

    Accepts both call shapes used by a dataset `map`:

    * a single record, where ``example["text"]`` is one string, and
    * a batch (``batched=True``), where ``example["text"]`` is a list of strings.

    Returns:
        dict: ``{"input_ids": ids}`` for a single record, or
        ``{"input_ids": [ids, ...]}`` with one entry per text for a batch.
    """
    def encode_ids(text):
        encoded = tokenizer.encode(text)
        # Accept either an encoding object exposing `.ids` or a plain token
        # sequence. NOTE(review): which one `RegexTokenizer.encode` returns is
        # not visible in this notebook — confirm.
        return getattr(encoded, "ids", encoded)

    texts = example["text"]
    if isinstance(texts, str):
        return {"input_ids": encode_ids(texts)}
    # Batched call: encode each text separately instead of passing the list on.
    return {"input_ids": [encode_ids(text) for text in texts]}

# Encode the records, then persist the result to the "tokenized_dataset" directory.
# NOTE(review): with batched=True the mapped function is handed lists of texts
# rather than a single record, so it has to handle batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset.save_to_disk(dataset_path="tokenized_dataset")
