In [1]:
!pip install transformers



## Using a Pretrained tokenizer from Hugging Face

In [2]:
import torch
from transformers import GPT2Tokenizer

In [3]:
# Load a pre-trained BPE tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:

text = "lower newest widest"
encoded_input = tokenizer(text, return_tensors='pt') # pt for PyTorch tensors

print("Input Text:", text)
print("Token IDs:", encoded_input['input_ids'])

Input Text: lower newest widest
Token IDs: tensor([[21037, 15530, 46232]])


In [7]:
 tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])

['lower', 'Ġnewest', 'Ġwidest']

In [9]:
tokenizer.decode(encoded_input['input_ids'][0],skip_special_tokens=False)

'lower newest widest'

## Implementation of Tokenizers from scratch
    

In [8]:
class Tokenizer:
    def encode(self, text: str) -> list[int]:
        raise NotImplementedError

    def decode(self, tokens: list[int]) -> str:
        raise NotImplementedError


In [9]:
class charTokenizer(Tokenizer):
    def encode(self, text: str) -> list[int]:
        return [ord(c) for c in text]

    def decode(self, tokens: list[int]) -> str:
        return ''.join(chr(token) for token in tokens)


In [11]:
## Test char tokenizer
tokenizer = charTokenizer()
text = "Hello World !"
tokens = tokenizer.encode(text)
print("Encoded:", tokens)
decoded = tokenizer.decode(tokens)
print("Decoded:", decoded)
assert decoded == text, "The decoded text donot match"

Encoded: [72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 32, 33]
Decoded: Hello World !


In [None]:
import regex as re
from collections import Counter

class BPETokenizer(Tokenizer):

    def __init__(self,special_tokens=['<endoftext>']) -> None:
        super().__init__()
        #merges: dict[tuple[bytes, bytes]] A list of BPE merges produced from training. Each list item
        #is a tuple of bytes (<token1>, <token2>), representing that <token1> was merged with <token2>. The merges should be ordered by order of creation.
        self.merges = {}
        # vocab maps(1:1 mapping) from integers to bytes index remember bytes argument for item needs to be in [] otherwise it will create 0 byte of length n
        self.vocab = {i:bytes([i]) for i in range(256)}
        self.special_tokens = special_tokens
        for tk in special_tokens:
            self.vocab[len(self.vocab)]=tk.encode("UTF-8")
    
    @staticmethod
    def pair_stats(counter:Counter)->dict[[tuple],int]:
        pair_counts={}
        for k,v in counter.items():
            for i in range(len(k)-1):
                pair=(k[i],k[i+1])
                pair_counts[pair]=pair_counts.get(pair,0)+v
        return pair_counts

    @staticmethod
    def merge(counter: Counter, pair: tuple, index: int) -> dict:
        pair_counts={}
        for k,v in counter.items():
            new_k=[]
            i = 0
            while i < len(k):
                if i<len(k)-1 and (k[i],k[i+1])==pair:
                    new_k.append(index)
                    i+=2
                else:
                    new_k.append(k[i])
                    i+=1
            new_k = tuple(new_k)
            pair_counts[new_k] = v        
        return pair_counts
    

    def pretokenization(self, text: str) -> list[bytes]:
        """Splits text into chunks, respecting special tokens."""
        PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        
        if self.special_tokens:
            special_pattern = f"({'|'.join(re.escape(s) for s in self.special_tokens)})"
            chunks = re.split(special_pattern, text)
        else:
            chunks = [text]

        splits = []
        for chunk in chunks:
            if chunk in self.special_tokens:
                splits.append(chunk.encode("utf-8"))
            else:
                splits.extend(s.encode("utf-8") for s in re.findall(PAT, chunk))
        return splits


    def train_tokenizer(self, text: str, vocab_size=300) -> None:
        assert vocab_size>256, "Vocanb size should be larger than 256 (number of bytes)"
        # Implement BPE training logic here
        num_merges = vocab_size - 256
        pretokenized=self.pretokenization(text)
        pretoken_ids = [tuple(bs) for bs in pretokenized]
        idx_count = Counter(pretoken_ids)
        pair_stats =  BPETokenizer.pair_stats(idx_count)
        merges=0
        while merges<num_merges:
            best_pair = max(pair_stats, key= lambda pair: (pair_stats[pair],pair))
            if not best_pair:
                break
            # Merge the best pair
            new_token = self.vocab[best_pair[0]] + self.vocab[best_pair[1]]
            new_idx= len(self.vocab)
            ## this is best pair ids for encoding
            self.merges[(best_pair[0], best_pair[1])] = new_idx
            ## this is bytes merge token for decoding
            self.vocab[new_idx] = new_token
            # replace oriiginal pair index with new index
            idx_count=BPETokenizer.merge(idx_count, best_pair, new_idx)
            pair_stats = BPETokenizer.pair_stats(idx_count)
            merges += 1

    def encode(self, text: str) -> list[int]:
        ## apply first pretokenization
        pretokenized=self.pretokenization(text)
        pretoken_ids = [tuple(bs) for bs in pretokenized]
        ## flatten to single list 
        flatten_ids=[]
        for ids in pretoken_ids:
            while len(ids)>1:
                pairs=[(ids[i], ids[i+1]) for i in range(len(ids)-1)]
                # the pair to merge is the one which has the lowest merge index
                pair_to_merge = min(pairs, key= lambda pair: self.merges.get(pair, float('inf')))
                if pair_to_merge not in self.merges:
                    break
                merge_ids=[]
                i = 0
                while i<len(ids):
                    if i<len(ids)-1 and (ids[i],ids[i+1])==pair_to_merge:
                        merge_ids.append(self.merges[pair_to_merge])
                        i+=2
                    else:
                        merge_ids.append(ids[i])
                        i+=1
                ids=merge_ids
            # append the final ids to flatten_ids               
            flatten_ids.extend(ids)
        return flatten_ids

    def decode(self, tokens: list[int]) -> str:
        # Implement BPE decoding logic here
        bs=b"".join([self.vocab[i] for i in tokens])
        return bs.decode('utf-8', errors='replace')


In [70]:
bpe=BPETokenizer()
splits = bpe.train_tokenizer("low low low low low lower lower widest widest widest newest newest newest newest newest newest")
splits

TypeError: unhashable type: 'list'

In [None]:
text="🌎is"
bt=text.encode("UTF-8")

[] item in bt:
    print(item)

240
159
140
142
105
115


In [71]:
chr([240,159,140,142])

TypeError: an integer is required (got type list)