In [1]:
!pip install transformers



## Using a Pretrained tokenizer from Hugging Face

In [2]:
import torch
from transformers import GPT2Tokenizer

In [3]:
# Load the pretrained GPT-2 BPE tokenizer by name; as the progress bars below
# show, this fetches its vocab.json / merges.txt / config files.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:

# Tokenize a sample sentence; return_tensors="pt" requests PyTorch tensors.
text = "lower newest widest"
encoded_input = tokenizer(text, return_tensors="pt")

print(f"Input Text: {text}")
print("Token IDs:", encoded_input["input_ids"])

Input Text: lower newest widest
Token IDs: tensor([[21037, 15530, 46232]])


In [7]:
 # Map the IDs of the first (only) encoded row back to their vocabulary strings.
 # In the output, 'Ġ' shows up where a token begins with a space.
 tokenizer.convert_ids_to_tokens(
     encoded_input["input_ids"][0]
 )

['lower', 'Ġnewest', 'Ġwidest']

In [9]:
# Round trip: decoding the IDs should give back the original sentence.
tokenizer.decode(
    encoded_input["input_ids"][0],
    skip_special_tokens=False,
)

'lower newest widest'

## Implementation of Tokenizers from scratch
    

In [8]:
class Tokenizer:
    """Minimal tokenizer interface: text -> token IDs and back.

    The base methods only raise ``NotImplementedError``; concrete tokenizers
    are expected to override both of them.
    """

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of integer token IDs."""
        raise NotImplementedError()

    def decode(self, tokens: list[int]) -> str:
        """Convert a list of integer token IDs back into a string."""
        raise NotImplementedError()


In [9]:
class charTokenizer(Tokenizer):
    """Character-level tokenizer: every character becomes its Unicode code point."""

    def encode(self, text: str) -> list[int]:
        """Return the code point (``ord``) of each character in ``text``, in order."""
        return list(map(ord, text))

    def decode(self, tokens: list[int]) -> str:
        """Rebuild the string by turning each code point back into a character."""
        return "".join(map(chr, tokens))


In [11]:
## Test char tokenizer
# Use a dedicated name instead of rebinding `tokenizer`: that global holds the
# GPT-2 tokenizer loaded earlier, and overwriting it would break the Hugging
# Face cells above if they are re-run after this one.
char_tokenizer = charTokenizer()
text = "Hello World !"
tokens = char_tokenizer.encode(text)
print("Encoded:", tokens)
decoded = char_tokenizer.decode(tokens)
print("Decoded:", decoded)
# Encoding followed by decoding must be lossless.
assert decoded == text, "The decoded text donot match"

Encoded: [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 32, 33]
Decoded: Hello World !


In [None]:
import regex as re

class BPETokenizer(Tokenizer):
    """Byte-level BPE tokenizer skeleton.

    Only the 256-entry byte vocabulary and the regex pre-tokenization step are
    implemented. Training, encoding and decoding are placeholders that raise
    ``NotImplementedError`` until they are written.
    """

    # Split pattern used by `pretokenization`. The \p{L} / \p{N} classes need
    # the third-party `regex` module (imported in this cell as `re`).
    _PRETOKENIZE_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    def __init__(self) -> None:
        """Create an untrained tokenizer whose vocabulary is the 256 single bytes."""
        super().__init__()
        # Ordered list of learned BPE merges; empty until training is implemented.
        self.merges = []
        # Base vocabulary, a 1:1 mapping between token id i and the single byte i.
        # bytes([i]) is required: bytes(i) would create i zero bytes instead.
        self.idx2bytes = {i: bytes([i]) for i in range(256)}
        # Reverse lookup: single byte -> token id.
        self.bytes2idx = {byte: idx for idx, byte in self.idx2bytes.items()}

    @classmethod
    def pretokenization(cls, text: str) -> list[bytes]:
        """Split ``text`` into pre-tokens and return each one UTF-8 encoded.

        The pattern separates common English contraction suffixes ('s, 'll,
        ...), runs of letters, runs of digits, runs of other non-space
        characters (each optionally preceded by one space) and whitespace.

        Args:
            text: The string to split.

        Returns:
            The matched pieces in order of appearance, as ``bytes`` objects.
        """
        return [
            piece.encode("utf-8")
            for piece in re.findall(cls._PRETOKENIZE_PATTERN, text)
        ]

    def train_tokenizer(self, text: str) -> None:
        """Learn BPE merges from ``text``.

        Raises:
            NotImplementedError: Always; training is not implemented yet.
        """
        # Fail loudly rather than pretending that training happened.
        raise NotImplementedError("BPE training is not implemented yet")

    def encode(self, text: str) -> list[int]:
        """Encode ``text`` into BPE token IDs.

        Raises:
            NotImplementedError: Always; encoding is not implemented yet.
        """
        # Same contract as the base class: never silently return None.
        raise NotImplementedError("BPE encoding is not implemented yet")

    def decode(self, tokens: list[int]) -> str:
        """Decode BPE token IDs back into text.

        Raises:
            NotImplementedError: Always; decoding is not implemented yet.
        """
        raise NotImplementedError("BPE decoding is not implemented yet")

In [43]:
# Try the pre-tokenizer on a sentence with a contraction, a hyphen and punctuation.
# NOTE: pretokenization UTF-8 encodes every piece, so a fresh run displays
# bytes objects (b'...'); re-run this cell if the shown output has plain strings.
bpe = BPETokenizer()
splits = bpe.pretokenization(
    "somaae text that i'll pre-tokenasaizea!"
)
splits

['somaae', ' text', ' that', ' i', "'ll", ' pre', '-', 'tokenasaizea', '!']