In [1]:
# Load the raw training text. The encoding is passed explicitly so the result
# does not depend on the platform's default locale encoding.
with open('the-verdict.txt', mode='r', encoding='utf-8') as f:
    raw_data = f.read()

# Total number of characters in the corpus.
print(len(raw_data))

20479


## Step 1 : Creating Tokens

In [2]:
import re

# Toy sentence used to compare three increasingly fine-grained split patterns.
exp = 'Hello, world. this is a pratical example!'

# (a) Split on single spaces: punctuation stays glued to the words.
print(re.compile(' ').split(exp))
# (b) Split on any whitespace character; the capture group keeps the separators.
print(re.compile(r'(\s)').split(exp))
# (c) Also split on '.', ',' and '!' so punctuation becomes its own item.
r = re.compile(r'([.,!]|\s)').split(exp)
print(r)
# Drop empty strings and whitespace-only items, leaving only the tokens.
r = list(filter(str.strip, r))
print(r)

['Hello,', 'world.', 'this', 'is', 'a', 'pratical', 'example!']
['Hello,', ' ', 'world.', ' ', 'this', ' ', 'is', ' ', 'a', ' ', 'pratical', ' ', 'example!']
['Hello', ',', '', ' ', 'world', '.', '', ' ', 'this', ' ', 'is', ' ', 'a', ' ', 'pratical', ' ', 'example', '!', '']
['Hello', ',', 'world', '.', 'this', 'is', 'a', 'pratical', 'example', '!']


In [3]:
exp = "Hello, world. this is -- a test?"
# Whitespace-only split: '--' survives as a single item here.
print(re.compile(r'(\s)').split(exp))

# Wider punctuation set. Note: '-' is itself in the character class, and the
# class is tried before the '--' alternative, so each hyphen is matched on its
# own and '--' comes out as two separate '-' items.
result = re.compile(r'([.,;:_\-"()?!\']|--|\s)').split(exp)
print(result)
# Keep only the items that are non-empty after stripping whitespace.
result = list(filter(str.strip, result))
print(result)

['Hello,', ' ', 'world.', ' ', 'this', ' ', 'is', ' ', '--', ' ', 'a', ' ', 'test?']
['Hello', ',', '', ' ', 'world', '.', '', ' ', 'this', ' ', 'is', ' ', '', '-', '', '-', '', ' ', 'a', ' ', 'test', '?', '']
['Hello', ',', 'world', '.', 'this', 'is', '-', '-', 'a', 'test', '?']


In [4]:
# Tokenize the whole corpus: split on the listed punctuation, '--' and
# whitespace (separators are captured, so punctuation is kept as tokens), then
# drop empty and whitespace-only pieces.
preprocessed = [
    piece
    for piece in re.split(r'([.,;:_"()?!\']|--|\s)', raw_data)
    if piece.strip()
]
print(len(preprocessed))  # number of tokens
print(len(raw_data))      # number of characters, for comparison

4690
20479


## Step 2 : Creating Token IDs

In [5]:
# Unique tokens of the corpus, in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
print(len(all_words))

1130


In [6]:
# Encoding table: token (str) -> id (int); ids follow the order of all_words.
vocab = dict(zip(all_words, range(len(all_words))))
print(vocab)
# Also list the (token, id) pairs one per line.
for entry in vocab.items():
    print(entry)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Ri

In [7]:
# Vocabulary size: number of token -> id entries.
len(vocab)

1130

In [8]:
# Decoding table: id (int) -> token (str), built from the same ordering of all_words.
vocabb = dict(enumerate(all_words))
print(vocabb)
# Also list the (id, token) pairs one per line.
for entry in vocabb.items():
    print(entry)

{0: '!', 1: '"', 2: "'", 3: '(', 4: ')', 5: ',', 6: '--', 7: '.', 8: ':', 9: ';', 10: '?', 11: 'A', 12: 'Ah', 13: 'Among', 14: 'And', 15: 'Are', 16: 'Arrt', 17: 'As', 18: 'At', 19: 'Be', 20: 'Begin', 21: 'Burlington', 22: 'But', 23: 'By', 24: 'Carlo', 25: 'Chicago', 26: 'Claude', 27: 'Come', 28: 'Croft', 29: 'Destroyed', 30: 'Devonshire', 31: 'Don', 32: 'Dubarry', 33: 'Emperors', 34: 'Florence', 35: 'For', 36: 'Gallery', 37: 'Gideon', 38: 'Gisburn', 39: 'Gisburns', 40: 'Grafton', 41: 'Greek', 42: 'Grindle', 43: 'Grindles', 44: 'HAD', 45: 'Had', 46: 'Hang', 47: 'Has', 48: 'He', 49: 'Her', 50: 'Hermia', 51: 'His', 52: 'How', 53: 'I', 54: 'If', 55: 'In', 56: 'It', 57: 'Jack', 58: 'Jove', 59: 'Just', 60: 'Lord', 61: 'Made', 62: 'Miss', 63: 'Money', 64: 'Monte', 65: 'Moon-dancers', 66: 'Mr', 67: 'Mrs', 68: 'My', 69: 'Never', 70: 'No', 71: 'Now', 72: 'Nutley', 73: 'Of', 74: 'Oh', 75: 'On', 76: 'Once', 77: 'Only', 78: 'Or', 79: 'Perhaps', 80: 'Poor', 81: 'Professional', 82: 'Renaissance', 83:

## Step 3 : Implement tokenizer V1

In [9]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a small set of punctuation
    characters. Every resulting token must already be a key of the
    vocabulary; unknown tokens raise ``KeyError``.
    """

    # Separators: one of the listed punctuation characters, '--', or a
    # whitespace character. The capture group makes re.split return the
    # separators as items, so punctuation is kept as tokens.
    _SPLIT_PATTERN = r'([.,;:_"()?!\']|--|\s)'
    # Whitespace in front of punctuation that attaches to the previous word.
    # ';' and ':' are included because encode() splits them off as tokens too.
    _SPACE_BEFORE_PUNCT = r'\s+([,.;:?!"()\'])'

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the inverse id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Convert text (str) into a list of token ids (int).

        Args:
            text: the string to tokenize.

        Returns:
            A list with one id per token, in order of appearance.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Discard the empty and whitespace-only items produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a list of token ids (int) back into text (str).

        Tokens are joined with single spaces, then the space in front of
        , . ; : ? ! " ( ) ' is removed. The result is not guaranteed to equal
        the original text: tokens such as '--' and '_' stay surrounded by spaces.

        Args:
            ids: iterable of ids present in the vocabulary.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[idx] for idx in ids)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [10]:
# Build the V1 tokenizer on the corpus vocabulary and round-trip a sentence.
tokenizerv1 = SimpleTokenizerV1(vocab)

text = "Ah, poor Stroud--as you say. Was _that_ his history?"
sample_ids = tokenizerv1.encode(text)
print(sample_ids)
# Decoding the ids again shows how much of the original text is recovered.
print(tokenizerv1.decode(sample_ids))

[12, 5, 781, 89, 6, 177, 1126, 856, 7, 106, 114, 987, 114, 549, 550, 10]
Ah, poor Stroud -- as you say. Was _ that _ his history?


In [11]:
# What happens when the text contains a word that is not in the vocabulary?
text = "Hello, do you like tea"
# print(tokenizerv1.encode(text)) # Error: KeyError, because encode() looks every token up in the vocab
# Fix: add special context tokens to the vocabulary.

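##### Adding special context tokens
- `<|endoftext|>`: separator placed between independent texts
- `<|unk|>`: stand-in for any token that is not in the vocabulary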

In [12]:
# Start from the sorted unique corpus tokens.
all_tokens = list(set(preprocessed))
all_tokens.sort()
print(len(all_tokens))
# Special tokens are appended after sorting, so they get the two highest ids.
all_tokens += ['<|endoftext|>', '<|unk|>']
vocab2 = dict(zip(all_tokens, range(len(all_tokens))))
# Show the last five entries to confirm the special tokens are present.
for entry in list(vocab2.items())[-5:]:
    print(entry)

1130
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [13]:
# Vocabulary size after appending the two special tokens.
print(len(vocab2))

1132


In [14]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, ``--`` and a small set of punctuation
    characters. Tokens missing from the vocabulary are replaced by the
    ``<|unk|>`` token instead of raising.
    """

    # Separators: one of the listed punctuation characters, '--', or a
    # whitespace character. The capture group makes re.split return the
    # separators as items, so punctuation is kept as tokens.
    _SPLIT_PATTERN = r'([.,;:_"()?!\']|--|\s)'
    # Whitespace in front of punctuation that attaches to the previous word.
    # ';' and ':' are included because encode() splits them off as tokens too.
    _SPACE_BEFORE_PUNCT = r'\s+([,.;:?!"()\'])'
    # Placeholder used for every token that is not a key of the vocabulary.
    _UNK_TOKEN = '<|unk|>'

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the inverse id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids. It needs a
                '<|unk|>' entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Convert text (str) into a list of token ids (int).

        Args:
            text: the string to tokenize.

        Returns:
            A list with one id per token; unknown tokens get the id of '<|unk|>'.

        Raises:
            KeyError: if an unknown token is met and '<|unk|>' is not in the
                vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Discard the empty and whitespace-only items produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # The '<|unk|>' lookup happens only when it is actually needed.
        return [
            self.str_to_int[token if token in self.str_to_int else self._UNK_TOKEN]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a list of token ids (int) back into text (str).

        Tokens are joined with single spaces, then the space in front of
        , . ; : ? ! " ( ) ' is removed. The result is not guaranteed to equal
        the original text: unknown words come back as '<|unk|>' and tokens
        such as '--' and '_' stay surrounded by spaces.

        Args:
            ids: iterable of ids present in the vocabulary.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[idx] for idx in ids)
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [15]:
# Build the V2 tokenizer on the vocabulary that includes the special tokens.
tokenizerv2 = SimpleTokenizerV2(vocab=vocab2)

# Same sentence as for V1: every word is in the vocabulary.
text = "Ah, poor Stroud--as you say. Was _that_ his history?"
sample_ids_v2 = tokenizerv2.encode(text)
print(sample_ids_v2)
print(tokenizerv2.decode(sample_ids_v2))

[12, 5, 781, 89, 6, 177, 1126, 856, 7, 106, 114, 987, 114, 549, 550, 10]
Ah, poor Stroud -- as you say. Was _ that _ his history?


In [16]:
# Words missing from the vocabulary no longer raise; they are encoded as <|unk|>.
text1 = "Hello, do you like tea"
ids_text1 = tokenizerv2.encode(text1)
print(ids_text1)
print(tokenizerv2.decode(ids_text1))

# Two independent texts joined by the <|endoftext|> separator.
text2 = "In the moon"
text = " <|endoftext|> ".join([text1, text2])
print(text)
ids_joined = tokenizerv2.encode(text)
print(ids_joined)
print(tokenizerv2.decode(ids_joined))

[1131, 5, 355, 1126, 628, 975]
<|unk|>, do you like tea
Hello, do you like tea <|endoftext|> In the moon
[1131, 5, 355, 1126, 628, 975, 1130, 55, 988, 1131]
<|unk|>, do you like tea <|endoftext|> In the <|unk|>


# Byte Pair Encoding

In [17]:
# Uncomment to install tiktoken (the run recorded below reports version 0.8.0).
# !pip install tiktoken

In [20]:
import importlib
import importlib.metadata
import tiktoken

# Show the installed tiktoken version, to document the environment used for the outputs below.
importlib.metadata.version('tiktoken')

'0.8.0'

In [46]:
# GPT-2 byte pair encoding. In the output below <|endoftext|> is encoded as id
# 50256; presumably the vocabulary size is 50257 — TODO confirm via tokenizer.n_vocab.
tokenizer = tiktoken.get_encoding("gpt2")

# Sample containing the special token and a made-up word ("unkownwordd").
text = (
    "Hello, do you like tea? <|endoftext|> in the moon"
    " with BPE there is no problem with unkownwordd."
)
# allowed_special lets <|endoftext|> be encoded as a single special-token id.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
# Decoding the ids reproduces the input text (see output), with no <|unk|> needed.
strings = tokenizer.decode(integers)
print(strings)


# Another sample; the output shows it is broken into sub-word pieces
# (3 words become 5 ids) and decodes back to the same text.
text = "ana ghadi ldar"
integers1 = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("\n",integers1)
strings1 = tokenizer.decode(integers1)
print(strings1)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 287, 262, 8824, 351, 347, 11401, 612, 318, 645, 1917, 351, 555, 74, 593, 4775, 67, 13]
Hello, do you like tea? <|endoftext|> in the moon with BPE there is no problem with unkownwordd.

 [2271, 24997, 9189, 300, 27455]
ana ghadi ldar
