In [1]:
# %pip install torch wandb transformers[torch] datasets tqdm 
%load_ext autoreload
%autoreload 2 

In [6]:
import wandb
# wandb.login(key="6f46f55bd51d76400f1e877ea7dfa75c5c7d05d6")

from transformers import GPT2TokenizerFast, GPTNeoForCausalLM, GPTNeoConfig, AutoTokenizer, AlbertTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk
from transformers import RobertaForCausalLM

from tokenizers import Tokenizer, pre_tokenizers, decoders, AddedToken, normalizers, trainers
from tokenizers.normalizers import BertNormalizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

from tokenizers.implementations import SentencePieceUnigramTokenizer

from tokenizers.processors import RobertaProcessing, TemplateProcessing
from tqdm import tqdm

from tokenizers.pre_tokenizers import Whitespace

In [6]:
def get_texts(dataset, split='train'):
    for example in dataset[split]:
        yield example['text']

In [7]:
dataset = load_dataset('deven367/babylm-100M', num_proc=16)

In [6]:
tokenizer = Tokenizer(BPE(
    dropout=None,
    unk_token="<unk>",
    continuing_subword_prefix="Ġ",
    end_of_word_suffix="",
    fuse_unk=False,
    byte_fallback=False,
    ignore_merges=False
))

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

# Configure post-processor
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", 2),
    cls=("<s>", 1),
    trim_offsets=True,
    add_prefix_space=False
)

# Add special tokens
special_tokens = [
    AddedToken("<pad>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("</s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<unk>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
]

# Create the trainer
trainer = BpeTrainer(
    vocab_size=3000, 
    special_tokens=[str(t.content) for t in special_tokens]
)

iterator = get_texts(dataset, 'train')
iterator = tqdm(iterator, total=len(dataset['train']))

tokenizer.train_from_iterator(iterator, trainer=trainer)

# Manually add the special tokens with detailed configurations
tokenizer.add_special_tokens([t.content for t in special_tokens])


100%|██████████| 10176300/10176300 [02:58<00:00, 56869.37it/s]







KeyboardInterrupt: 

In [14]:
# WordPiece Tokenizer Creator

tokenizer = Tokenizer(WordPiece(
    unk_token="<unk>"
))

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

# Configure post-processor
tokenizer.post_processor = RobertaProcessing(
    sep=("</s>", 2),
    cls=("<s>", 1),
    trim_offsets=True,
    add_prefix_space=False
)

# Add special tokens
special_tokens = [
    AddedToken("<pad>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("</s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<unk>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
]

# Create the trainer
trainer = WordPieceTrainer(
    vocab_size=20000, 
    special_tokens=[str(t.content) for t in special_tokens]
)

iterator = get_texts(dataset, 'train')
iterator = tqdm(iterator, total=len(dataset['train']))

tokenizer.train_from_iterator(iterator, trainer=trainer)

# Manually add the special tokens with detailed configurations
tokenizer.add_special_tokens([t.content for t in special_tokens])


100%|██████████| 10176300/10176300 [03:10<00:00, 53415.91it/s]







0

In [14]:
initial_alphabet = list("""!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞàáâãäåæçèéêëìíîïðĉČĠġĢģĤĥĦħĨĩĪīĬĭĮįİıĲĳĴĵĶķĸĹĺĻļĽľĿŀŁłŃ""")

def get_texts(dataset, split='train'):
    for example in dataset[split]:
        yield example['text']

# Define a function to filter out non-English text
def is_english(text):
    return all(c in initial_alphabet or c.isspace() for c in text)

# Create a generator with a progress bar and filtering
def line_generator(dataset, split='train'):
    texts = get_texts(dataset, split)
    for text in tqdm(texts, desc="Training SentencePiece Tokenizer"):
        filtered_text = ''.join([c for c in text if c in initial_alphabet or c.isspace()])
        if filtered_text:
            yield filtered_text

In [16]:
tokenizer = SentencePieceUnigramTokenizer()

# Define the normalizer and pre-tokenizer
# bert_normalizer = BertNormalizer(handle_chinese_chars=False)
tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Define the trainer for the Unigram model
special_tokens = [
    AddedToken("<pad>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("</s>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
    AddedToken("<unk>", single_word=False, lstrip=False, rstrip=False, normalized=True, special=True),
]

# iterator = get_texts(dataset, 'train')
# iterator = tqdm(iterator, total=len(dataset['train']))
# Train the tokenizer using the generator
tokenizer.train_from_iterator(line_generator(dataset, 'train'), vocab_size=6000, special_tokens=special_tokens, unk_token="<unk>", initial_alphabet=initial_alphabet)

# Save the trained tokenizer model
tokenizer.save("sentencepiece_unigram_tokenizer")

print("Tokenizer training complete.")


Training SentencePiece Tokenizer: 10176300it [10:48, 15696.19it/s]




Tokenizer training complete.


In [13]:
import sentencepiece as spm

# Define the initial alphabet as a string
initial_alphabet = set("".join(["!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞàáâãäåæçèéêëìíîïðĉČĠġĢģĤĥĦħĨĩĪīĬĭĮįİıĲĳĴĵĶķĸĹĺĻļĽľĿŀŁłŃ "]))


cleaned_file = 'cleaned_training_data.txt'

# Train the SentencePiece model with a restricted character set
spm.SentencePieceTrainer.train(
    input=cleaned_file,
    model_prefix='sentencepiece_unigram',
    vocab_size=2999,  # Adjust the vocabulary size as needed
    model_type='unigram',
    character_coverage=1.0,
    input_sentence_size=100000,
    shuffle_input_sentence=True,
    # user_defined_symbols=list(initial_alphabet),
    pad_id=0,  # Padding ID (default is 0)
    unk_id=1,  # Unknown token ID (default is 1)
    bos_id=2,  # Beginning of sentence ID (default is 2)
    eos_id=3,  # End of sentence ID (default is 3)
    # user_defined_symbols="<mask>"
)

KeyboardInterrupt: 

In [13]:
from transformers import AlbertTokenizer, BertTokenizer, T5Tokenizer, PreTrainedTokenizerFast
import tokenizers

model_path = 'tokenizers/6k-sp/sp_unigram.model'
vocab_path = 'tokenizers/6k-sp/sp_unigram.vocab'
# config_path = 'tokenizers/sentence-piece/tokenizer_config.json'
# special_tokens_map_path = 'tokenizers/sentence-piece/special_tokens_map.json'

# Create a custom tokenizer
# tokenizer = BertTokenizer(
#     vocab_file=vocab_path,    
#     sp_model_file=model_path
# )

# tokenizer = tokenizers.SentencePieceUnigramTokenizer().from_spm("tokenizers/sentence-piece/sentencepiece_unigram.model")
# print(type(tokenizer.model))

# Load the T5Tokenizer with the SentencePiece model
# tokenizer = T5Tokenizer(vocab_file=model_path)


# Load the AlbertTokenizer with the SentencePiece model
tokenizer = AlbertTokenizer(
    sp_model_file=model_path,
    mask_token="<mask>",
    cls_token="<s>",
    sep_token="</s>",
    bos_token="<s>",
    eos_token="</s>",
    vocab_file=model_path)

print(tokenizer)

# Sample text to test the tokenizer
sample_text = "This is a test sentence to check the tokenizer."

# Encode the text
encoded = tokenizer.encode(sample_text, add_special_tokens=True)
print(f"Encoded ids: {encoded}")

# Decode the ids back to text
decoded_text = tokenizer.decode(encoded)
print(f"Decoded text: {decoded_text}")


AlbertTokenizer(name_or_path='', vocab_size=5999, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5999: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
Encoded ids: [2, 41, 20, 9, 1223, 3386, 10, 1275, 7, 10, 2769, 1495, 78, 4, 3]
Decoded te

In [1]:
tokenizer.save_pretrained("./tokenizers/3k-sp/")

NameError: name 'tokenizer' is not defined

In [7]:
test_tok = AlbertTokenizer.from_pretrained("./tokenizers/3k-sp/")

In [99]:
tokenizer.save("./tokenizers/sentence-piece/6k-sp/tokenizer.json")
tokenizer.model.save("./tokenizers/sentence-piece/6k-sp")

AttributeError: 'AlbertTokenizer' object has no attribute 'save'

In [6]:
# TOkenizer changes

import json

# Path to the input vocabulary text file
vocab_text_file = './tokenizers/20k-wp/vocab.txt'

# Path to the output JSON file
vocab_json_file = './tokenizers/20k-wp/vocab.json'

# Read the vocabulary text file and create a dictionary
vocab_dict = {}
with open(vocab_text_file, 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        token = line.strip()  # Remove any leading/trailing whitespace
        vocab_dict[token] = idx

# Save the dictionary to a JSON file
with open(vocab_json_file, 'w', encoding='utf-8') as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=4)

KeyboardInterrupt: 

In [11]:
tokenizer = GPT2TokenizerFast.from_pretrained(
        'tokenizers/3k-bpe/',                                         # our custom tokenizer
        model_max_length=512    # sequence length (context window)
    )

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.
