### Getting the raw data

In [2]:
import urllib.request
import re


url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x78ca882cc280>)

### preprocessing text

In [23]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
text = "Hello, world. This, is a test."
result = re.split(r'(\s)',text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

In [6]:
text = "Hello, world. This, is a test."
result = re.split(r'([,.]|\s)',text)
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [15]:
text2 = "Hello, world. Is this-- a test?"
result = re.split(r'[,.:;?!"()\']|--|\s', text2)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', 'world', 'Is', 'this', 'a', 'test']


In [24]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


### Creating a vocabulary

In [29]:
all_words = set(preprocessed)
vocab_size = len(all_words)
print(vocab_size)

1130


In [31]:
vocab = {token:num for num,token in enumerate(all_words)}

In [46]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_string = {int_:str_ for str_,int_ in self.str_to_int.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_string[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
        

In [47]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[441, 230, 89, 514, 1058, 533, 242, 382, 65, 485, 715, 65, 441, 985, 63, 234, 470, 368, 303, 349, 63]


In [48]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
