# Data Preparation for an LLM

![Pipeline](images/pipeline.png)

![Data Preparation](images/data_preparation.png)

In [82]:
import urllib.request
import re
import tiktoken

## Get Data

In [26]:
# Raw GitHub address of the-verdict.txt, assembled from its path segments.
URL = "/".join(
    [
        "https://raw.githubusercontent.com/rasbt",
        "LLMs-from-scratch/main/ch02/01_main-chapter-code",
        "the-verdict.txt",
    ]
)

# Relative local path string for the-verdict.txt.
FILE_PATH = "data/the-verdict.txt"

In [27]:
# Fetch the file over HTTP; the context manager closes the response afterwards.
with urllib.request.urlopen(URL) as response:
    payload = response.read()

# read() returned bytes; decode them as UTF-8 to obtain the text.
raw_text = payload.decode("utf-8")

In [39]:
# Report the length of the text, then (after a blank line) preview its first 99 characters.
char_count = len(raw_text)
print(f"Total number of characters: {char_count:,}", "", raw_text[:99], sep="\n")

Total number of characters: 20,479

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Tokenize Data

In [29]:
# Split on punctuation marks, double dashes and whitespace. The capture group
# makes re.split keep every delimiter as its own list item.
split_pattern = r'([,.:;?_!"()\']|--|\s)'
raw_pieces = re.split(split_pattern, raw_text)

# Strip each piece and keep only the non-empty results, which drops empty
# strings and whitespace-only items (spaces, newlines, tabs).
preprocessed = [piece for piece in map(str.strip, raw_pieces) if piece]

4690


In [40]:
# `preprocessed` is a list of tokens, so its length is a token count
# (not a character count).
print(f"Total number of tokens: {len(preprocessed):,}")
print("")
print(preprocessed[:23])  # preview the first 23 tokens

Total number of characters: 4,690

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise']


## Create Vocabulary

In [42]:
# Deduplicate the tokens and sort them so the ordering is deterministic.
all_words = sorted(set(preprocessed))

# Map every token to its position in the sorted list.
vocab = dict(zip(all_words, range(len(all_words))))

In [53]:
vocab_size = len(vocab)
print(f"Total number of unique words: {vocab_size:,}")
print("")

# Walk the vocabulary in dictionary order and stop after the entry whose id is 15.
for token, token_id in vocab.items():
    print(f"{token} : {token_id}")
    if token_id == 15:
        break

Total number of unique words: 1,130

! : 0
" : 1
' : 2
( : 3
) : 4
, : 5
-- : 6
. : 7
: : 8
; : 9
? : 10
A : 11
Ah : 12
Among : 13
And : 14
Are : 15


## Create Simple Tokenizer

In [54]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation marks, double dashes and whitespace. Every
    resulting token must be present in the vocabulary.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: if a token is not in the vocabulary.
        """
        # ':' and ';' are part of the delimiter set, matching the pattern used
        # to build the vocabulary; without them "word;" stays one unknown token.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Args:
            ids: iterable of integer ids known to the vocabulary.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation marks removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the whitespace that join() placed before punctuation
        # (including ':' and ';').
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [55]:
# Build a V1 tokenizer on top of the current `vocab` mapping.
tokenizerV1 = SimpleTokenizerV1(vocab)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [56]:
# The literal opens with """ immediately followed by a " character, so the
# text itself starts with a double quote.
text = """"It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."""
ids = tokenizerV1.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]


In [57]:
# Turn the ids back into text. decode() joins tokens with spaces and only
# removes the space in front of punctuation, so the result is not identical
# to the input (e.g. `It' s`).
print(tokenizerV1.decode(ids))

" It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


## Create Simple Tokenizer with Out-of-Vocabulary Word Support

In [62]:
text = "Hello, do you like tea?"  # "Hello" is an out-of-vocab word

# The failure is the point of this cell, so catch and display the KeyError
# instead of letting it abort a full notebook run.
try:
    print(tokenizerV1.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

In [67]:
# Sorted unique tokens first, then the two special tokens appended at the end
# so they receive the two highest ids.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unk|>"]

# Map every token to its position in the list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [70]:
# The vocabulary now also holds the two appended tokens "<|endoftext|>" and "<|unk|>".
print(f"Total number of unique words: {len(vocab):,}") # 2 extra tokens in vocab

Total number of unique words: 1,132


In [71]:
class SimpleTokenizerV2:
    """Vocabulary-based tokenizer that substitutes "<|unk|>" for unknown tokens."""

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the token ids for text.

        The text is split on punctuation marks, double dashes and whitespace.
        Tokens missing from the vocabulary are looked up as "<|unk|>".
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty and whitespace-only pieces carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ids: tokens joined by spaces, with the space before punctuation removed."""
        tokens = (self.int_to_str[token_id] for token_id in ids)
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', spaced)

In [72]:
# Build a V2 tokenizer on top of the current `vocab` mapping.
tokenizerV2 = SimpleTokenizerV2(vocab)

In [79]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Glue the two samples together with the end-of-text marker between them.
text = f"{text1} <|endoftext|> {text2}"
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [80]:
# Encode the joined sample; tokens missing from the vocabulary are mapped to the "<|unk|>" id.
ids = tokenizerV2.encode(text)
print(ids)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [81]:
# Decode the ids again; positions that were encoded as unknown come back as "<|unk|>".
print(tokenizerV2.decode(ids))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## Byte Pair Encoding

In [83]:
# Load the "gpt2" encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

In [84]:
# NOTE(review): there is no space between "terraces" and "of" — the two parts
# are concatenated as-is, so the input contains "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    + "of someunknownPlace."
)

# Pass "<|endoftext|>" through tiktoken's `allowed_special` argument.
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [85]:
# Decode the BPE token ids back into a single string.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


## Data Sampling