In [1]:
class BasicTokenizer:
    """Minimal byte-level byte-pair-encoding (BPE) tokenizer.

    Text is first turned into its UTF-8 bytes (token ids 0-255). Training
    repeatedly replaces the most frequent adjacent pair of ids with a new id
    (256, 257, ...) and records that merge. Encoding replays the recorded
    merges in the order they were learned; decoding concatenates the byte
    string of every id and decodes the result as UTF-8.

    Attributes:
        merge: dict mapping ``(left_id, right_id)`` to the merged token id,
            in the order the merges were learned.
        vocab_size: vocabulary size requested in the last successful
            ``train`` call, or ``None`` if never trained.
        vocab: dict mapping every known token id to its ``bytes`` value.
    """

    def __init__(self):
        self.merge = {}
        self.vocab_size = None
        # Seed with the 256 single-byte tokens so decode() works on raw byte
        # ids even before train() has been called.
        self.vocab = {k: bytes([k]) for k in range(256)}

    def getstat(self, tokens):
        """Count how often each adjacent pair of ids occurs in ``tokens``.

        Args:
            tokens: list of token ids.

        Returns:
            dict mapping ``(id_i, id_i+1)`` to its number of occurrences.
            Keys appear in order of first occurrence; the dict is empty when
            ``tokens`` has fewer than two elements.
        """
        count = {}
        for pair in zip(tokens, tokens[1:]):
            count[pair] = count.get(pair, 0) + 1
        return count

    def mergepair(self, ids, pair, idx):
        """Replace every non-overlapping occurrence of ``pair`` with ``idx``.

        The scan is left to right, so in ``[a, a, a]`` with pair ``(a, a)``
        only the first two elements are merged.

        Args:
            ids: list of token ids.
            pair: 2-tuple ``(left_id, right_id)`` to look for.
            idx: token id that replaces each occurrence of ``pair``.

        Returns:
            A new list; ``ids`` itself is not modified.
        """
        new_tokens = []
        i = 0
        last = len(ids) - 1
        while i < len(ids):
            if i < last and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                new_tokens.append(idx)
                i += 2
            else:
                new_tokens.append(ids[i])
                i += 1
        return new_tokens

    def train(self, text, vocab_size, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Any merges from a previous ``train`` call are discarded. Training
        stops early when no adjacent pairs are left (text shorter than two
        bytes, or the whole text has collapsed into a single token).

        Args:
            text: training string; it is encoded as UTF-8.
            vocab_size: target vocabulary size, must be >= 256. For smaller
                values a message is printed and the tokenizer is left
                unchanged.
            verbose: when true, print one line per learned merge.
        """
        if vocab_size < 256:
            print("The vocab size should be greater than or equal to 256")
            return

        self.vocab_size = vocab_size
        tokens = list(text.encode("utf-8"))

        # Build into fresh containers so a re-train never mixes in stale
        # merges whose ids would now refer to different pairs.
        merges = {}
        vocab = {k: bytes([k]) for k in range(256)}
        for idx in range(256, vocab_size):
            stats = self.getstat(tokens)
            if not stats:
                # Nothing left to merge; max() on an empty dict would raise.
                break
            # Ties are resolved in favour of the pair seen first in the text.
            maxpair = max(stats, key=stats.get)
            tokens = self.mergepair(tokens, maxpair, idx)
            merges[maxpair] = idx
            # Both halves are already in vocab because ids are assigned in
            # increasing order.
            vocab[idx] = vocab[maxpair[0]] + vocab[maxpair[1]]
            if verbose:
                print(f"Pair {maxpair} merged into {idx}")

        self.merge = merges
        self.vocab = vocab

    def encode(self, text):
        """Encode ``text`` into a list of token ids using the learned merges.

        Args:
            text: string to encode.

        Returns:
            List of token ids. With no learned merges this is simply the
            UTF-8 bytes of ``text``.
        """
        tokens = list(text.encode('utf-8'))
        while len(tokens) >= 2:
            stats = self.getstat(tokens)
            # Apply merges in the order they were learned: pick the present
            # pair with the smallest merged id; unknown pairs rank last.
            minpair = min(stats, key=lambda x: self.merge.get(x, float('inf')))
            if minpair not in self.merge:
                break
            tokens = self.mergepair(tokens, minpair, self.merge[minpair])
        return tokens

    def decode(self, ids):
        """Decode a sequence of token ids back into a string.

        Args:
            ids: iterable of token ids present in ``self.vocab``.

        Returns:
            The decoded string. Byte sequences that are not valid UTF-8 are
            replaced with U+FFFD rather than raising.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        tokens_decoded = b"".join(self.vocab[x] for x in ids)
        return tokens_decoded.decode('utf-8', errors='replace')

    def show_merge_values(self):
        """Print each learned merge as ``(left_id, right_id):<decoded text>``."""
        for k, v in self.merge.items():
            print(f'{k}:{self.vocab[v].decode("utf-8",errors="replace")}')

In [2]:
# Read the training corpus. Decode explicitly as UTF-8 so the result does not
# depend on the platform's default locale encoding.
with open('taylorswift_wikipedia.txt','r',encoding='utf-8') as f:
    text=f.read()

In [3]:
# Train on the corpus with a target vocabulary of 300 ids (the 256 byte ids
# plus learned merges), printing each merge as it is learned.
tokenizer=BasicTokenizer()
tokenizer.train(text,vocab_size=300,verbose=True)

Pair (101, 32) merged into 256
Pair (44, 32) merged into 257
Pair (100, 32) merged into 258
Pair (46, 32) merged into 259
Pair (114, 32) merged into 260
Pair (50, 48) merged into 261
Pair (115, 32) merged into 262
Pair (105, 110) merged into 263
Pair (111, 110) merged into 264
Pair (114, 105) merged into 265
Pair (116, 32) merged into 266
Pair (116, 104) merged into 267
Pair (101, 258) merged into 268
Pair (257, 261) merged into 269
Pair (97, 110) merged into 270
Pair (97, 114) merged into 271
Pair (101, 260) merged into 272
Pair (121, 32) merged into 273
Pair (97, 108) merged into 274
Pair (267, 256) merged into 275
Pair (118, 268) merged into 276
Pair (119, 105) merged into 277
Pair (101, 114) merged into 278
Pair (264, 32) merged into 279
Pair (277, 102) merged into 280
Pair (82, 101) merged into 281
Pair (83, 280) merged into 282
Pair (111, 260) merged into 283
Pair (99, 104) merged into 284
Pair (269, 49) merged into 285
Pair (111, 109) merged into 286
Pair (98, 272) merged into 2

In [4]:
# Encode a short sentence with the trained tokenizer and show the token ids.
encoded_text=tokenizer.encode("Hello how are you?")
print(encoded_text)

[72, 101, 108, 108, 111, 32, 104, 111, 119, 32, 271, 256, 121, 111, 117, 63]


In [5]:
# Decode the token ids held in `encoded_text` back into a string.
decoded_text=tokenizer.decode(encoded_text)
print(decoded_text)

Hello how are you?


In [6]:
# Round-trip check on ASCII, Hangul and emoji input: decoding the encoding
# must give back the original string. Uses its own variable names so the
# training corpus held in `text` is not overwritten.
sample_text="hello world!!!? (안녕하세요!) lol123 😉"
roundtrip_text=tokenizer.decode(tokenizer.encode(sample_text))
print(sample_text==roundtrip_text)
print(roundtrip_text)

True
hello world!!!? (안녕하세요!) lol123 😉


In [7]:
# Show every learned merge: the merged pair of token ids and the text it decodes to.
tokenizer.show_merge_values()

(101, 32):e 
(44, 32):, 
(100, 32):d 
(46, 32):. 
(114, 32):r 
(50, 48):20
(115, 32):s 
(105, 110):in
(111, 110):on
(114, 105):ri
(116, 32):t 
(116, 104):th
(101, 258):ed 
(257, 261):, 20
(97, 110):an
(97, 114):ar
(101, 260):er 
(121, 32):y 
(97, 108):al
(267, 256):the 
(118, 268):ved 
(119, 105):wi
(101, 114):er
(264, 32):on 
(277, 102):wif
(82, 101):Re
(83, 280):Swif
(111, 260):or 
(99, 104):ch
(269, 49):, 201
(111, 109):om
(98, 272):ber 
(32, 275): the 
(97, 121):ay
(101, 110):en
(111, 114):or
(274, 32):al 
(101, 109):em
(46, 10):.

(265, 101):rie
(263, 103):ing
(269, 50):, 202
(116, 105):ti
(289, 108):ayl
