### Getting the raw data

In [1]:
import urllib.request
import re


url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7ec3582fb6d0>)

### preprocessing text

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
text = "Hello, world. This, is a test."
result = re.split(r'(\s)',text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

In [4]:
text = "Hello, world. This, is a test."
result = re.split(r'([,.]|\s)',text)
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [5]:
text2 = "Hello, world. Is this-- a test?"
result = re.split(r'[,.:;?!"()\']|--|\s', text2)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', 'world', 'Is', 'this', 'a', 'test']


In [6]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


### Creating a vocabulary

In [7]:
all_words = set(preprocessed)
vocab_size = len(all_words)
print(vocab_size)

1130


In [8]:
vocab = {token:num for num,token in enumerate(all_words)}
len(vocab)

1130

In [9]:
class SimpleTokenizerV1:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_string = {int_:str_ for str_,int_ in self.str_to_int.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_string[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
        

In [10]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[640, 836, 87, 382, 1109, 131, 66, 203, 214, 107, 831, 214, 640, 306, 476, 237, 210, 605, 281, 70, 476]


In [11]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


# Making improvements

the concept here is to make the tokenize to take care about, the unknown words and understand where ends a sentence.

In [12]:
more_tokens = sorted(list(set(preprocessed)))
more_tokens.extend(["<|endoftext|>","<|unk|>"])
new_vocab = {token:int_ for int_,token in enumerate(more_tokens)}
print(len(new_vocab))

1132


In [13]:
for _, pair in enumerate(list(new_vocab.items())[-7:]):
    print(pair)

('yet', 1125)
('you', 1126)
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [14]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.string_to_int = vocab
        self.int_to_string = {int_:string for string,int_ in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.:;?!_"()ˇ]|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if (item in self.string_to_int) else '<|unk|>' for item in preprocessed]
        
        ids = [self.string_to_int[string] for string in preprocessed]
        
        return ids
        
        
    def decode(self,ids):
        text = " ".join([self.int_to_string[id] for id in ids])
        text = re.sub(r'\s+([,.?!_":;()\'])',r'\1', text)
        return text
        

In [15]:
teste = SimpleTokenizerV2(new_vocab)

ids = teste.encode("are you younger ,senhorita ?")
ids

[169, 1126, 1127, 5, 1131, 10]

In [16]:
teste.decode(ids)

'are you younger, <|unk|>?'

### Using toiktoken

In [17]:
import tiktoken

with open("the-verdict.txt", 'r', encoding="utf-8") as f:
    raw_text = f.read()
    

In [18]:
tokenizer = tiktoken.get_encoding("gpt2")

In [19]:
tokenizer.decode(tokenizer.encode("test if this thing working."))

'test if this thing working.'

### Making a Dataset and Dataloader classes

In [46]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        token_id = tokenizer.encode(txt)
        
        for i in range(0, len(token_id) - max_length, stride):
            input_chunk = token_id[i : i+max_length]
            output_chunk = token_id[i+1 : i+max_length+1]
            
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))
        
        
    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]
    
    def __len__(self):
        return len(self.input_ids)
        
            
def create_dataloader(txt,batch_size: int = 4, max_length: int=256,
                      stride: int = 128, shuffle: bool= True, drop_last: bool = True,
                      num_workers=0) -> DataLoader:
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(txt, tokenizer=tokenizer, max_length=max_length, stride=stride)
    
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    
    return dataloader   

In [77]:
dataloader = create_dataloader(raw_text, batch_size=1,max_length=4,stride=1, shuffle=False)

In [78]:
data_iter = iter(dataloader)

In [79]:
print(next(data_iter))
print(next(data_iter))
print(next(data_iter))
print(next(data_iter))

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
[tensor([[2885, 1464, 1807, 3619]]), tensor([[1464, 1807, 3619,  402]])]
[tensor([[1464, 1807, 3619,  402]]), tensor([[1807, 3619,  402,  271]])]
