### Getting the raw data

In [1]:
import urllib.request
import re


url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7b8ebc7adee0>)

### preprocessing text

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
text = "Hello, world. This, is a test."
result = re.split(r'(\s)',text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

In [4]:
text = "Hello, world. This, is a test."
result = re.split(r'([,.]|\s)',text)
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [5]:
text2 = "Hello, world. Is this-- a test?"
result = re.split(r'[,.:;?!"()\']|--|\s', text2)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', 'world', 'Is', 'this', 'a', 'test']


In [6]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


### Creating a vocabulary

In [7]:
all_words = set(preprocessed)
vocab_size = len(all_words)
print(vocab_size)

1130


In [11]:
vocab = {token:num for num,token in enumerate(all_words)}
len(vocab)

1130

In [12]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_string = {int_:str_ for str_,int_ in self.str_to_int.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_string[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
        

In [13]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[717, 372, 68, 149, 88, 782, 227, 971, 396, 1056, 903, 396, 717, 237, 267, 135, 877, 979, 566, 524, 267]


In [14]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


# Making improvements

the concept here is to make the tokenize to take care about, the unknown words and understand where ends a sentence.

In [15]:
more_tokens = sorted(list(set(preprocessed)))
more_tokens.extend(["<|endoftext|>","<|unk|>"])
new_vocab = {token:int_ for int_,token in enumerate(more_tokens)}
print(len(new_vocab))

1132


In [24]:
for _, pair in enumerate(list(new_vocab.items())[-7:]):
    print(pair)

('yet', 1125)
('you', 1126)
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [50]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.string_to_int = vocab
        self.int_to_string = {int_:string for string,int_ in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?!_"()ˇ]|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if (item in self.string_to_int) else '<|unk|>' for item in preprocessed]
        
        ids = [self.string_to_int[string] for string in preprocessed]
        
        return ids
        
        
    def decode(self,ids):
        text = " ".join([self.int_to_string[id] for id in ids])
        text = re.sub(r'\s+([,.?!_":;()\'])',r'\1', text)
        return text
        

In [51]:
teste = SimpleTokenizerV2(new_vocab)

ids = teste.encode("are you younger ,senhorita ?")
ids

[169, 1126, 1127, 5, 1131, 10]

In [52]:
teste.decode(ids)

'are you younger, <|unk|>?'