In [14]:
import PyPDF2

# Extract the text of every page and concatenate it into one string.
with open("The_Name_of_the_Wind.pdf", "rb") as f:
    reader = PyPDF2.PdfReader(f)
    # A single join avoids rebuilding the growing string once per page.
    raw_text = "".join(page.extract_text() for page in reader.pages)

print("Total number of characters:", len(raw_text))
print(raw_text[:99])


Total number of characters: 1394574
Table of Contents
 
 
Title Page
Copyright Page
Dedication
 
CHAPTER ONE - A Place for Demons
CHAPT


In [15]:
import re as re

In [16]:
# Split on punctuation, double dashes and whitespace. The capturing group keeps
# the delimiters as tokens; empty and whitespace-only pieces are dropped.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

['Table', 'of', 'Contents', 'Title', 'Page', 'Copyright', 'Page', 'Dedication', 'CHAPTER', 'ONE', '-', 'A', 'Place', 'for', 'Demons', 'CHAPTER', 'TWO', '-', 'A', 'Beautiful', 'Day', 'CHAPTER', 'THREE', '-', 'Wood', 'and', 'Word', 'CHAPTER', 'FOUR', '-']


In [17]:
# Total number of tokens in the text (punctuation marks count as tokens).
print(len(preprocessed))

302841


In [18]:
# The vocabulary is the set of distinct tokens, in sorted order.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

17110


In [19]:
# Map each token to its position in the sorted token list.
vocab = {token: token_id for token_id, token in enumerate(all_words)}


In [20]:
# Preview the first 51 (token, id) pairs; zip stops once range(51) is exhausted.
for i, item in zip(range(51), vocab.items()):
    print(item)

('!', 0)
('(', 1)
(')', 2)
(',', 3)
('-', 4)
('-3', 5)
('.', 6)
('0', 7)
('1973', 8)
('1988', 9)
('2007', 10)
('2010', 11)
('23rd', 12)
('247', 13)
('28th', 14)
('3', 15)
('5', 16)
('5750', 17)
('7', 18)
('8705', 19)
('9', 20)
('978', 21)
('9EA', 22)
(':', 23)
(';', 24)
('?', 25)
('A', 26)
('ABENTHY', 27)
('ABOUT', 28)
('ADMISSIONS', 29)
('ADVICE', 30)
('AFTER', 31)
('AGAIN', 32)
('AINS', 33)
('ALL', 34)
('AM', 35)
('AN', 36)
('AND', 37)
('ANKER’S', 38)
('ANY', 39)
('ARCANIST', 40)
('AROUND', 41)
('AST', 42)
('AT', 43)
('AWAKE', 44)
('Aaron', 45)
('Aaron’s', 46)
('Abbott’s', 47)
('Abbot’s', 48)
('Abenthy', 49)
('Abenthy’s', 50)


In [21]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token must already exist in the vocabulary.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        # The capturing group makes re.split keep the delimiters as tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that join() inserted before punctuation. ':' and ';'
        # are included because encode() emits them as separate tokens; without
        # them "a: b" would decode as "a : b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [22]:
# Instantiate the tokenizer with the vocabulary built from the book text.
tokenizer = SimpleTokenizerV1(vocab)

# Every word and punctuation mark in this sentence must already be in the
# vocabulary: SimpleTokenizerV1.encode raises KeyError for unknown tokens.
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

[964, 3, 5761, 15769, 8995, 13889, 25]


In [23]:
# text = """"It's the last he painted, you know," 
#            Mrs. Gisburn said with pardonable pride."""
# ids = tokenizer.encode(text)
# print(ids)

In [24]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# text tokens, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = {token: token_id for token_id, token in enumerate(all_tokens)}

In [25]:
# Show the last five vocabulary entries to confirm the special tokens were
# appended. The enumerate index was never used, so iterate the slice directly.
for item in list(vocab.items())[-5:]:
    print(item)

('”“Wolves', 17107)
('”“You', 17108)
('”“You’re', 17109)
('<|endoftext|>', 17110)
('<|unk|>', 17111)


In [26]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    The vocabulary passed in is expected to contain an entry for "<|unk|>";
    otherwise encoding an unknown token raises KeyError.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not in the vocabulary are encoded with the id of
        "<|unk|>".
        """
        # The capturing group keeps punctuation, "--" and whitespace as pieces.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        stripped = (piece.strip() for piece in pieces)
        known = self.str_to_int
        # Skip empty pieces; substitute "<|unk|>" and look up the id in one pass.
        return [
            known[tok if tok in known else "<|unk|>"]
            for tok in stripped
            if tok
        ]

    def decode(self, ids):
        """Return the text for ``ids``; raises KeyError for an unknown id."""
        words = [self.int_to_str[token_id] for token_id in ids]
        joined = " ".join(words)
        # Remove the space that the join placed before punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [None]:
# Switch to the tokenizer that tolerates out-of-vocabulary words.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the two unrelated samples with the end-of-text marker between them.
text = f"{text1} <|endoftext|> {text2}"

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [28]:
# Encode the joined text; tokens missing from the vocabulary map to the id of "<|unk|>".
tokenizer.encode(text)


[964,
 3,
 5761,
 15769,
 8995,
 13889,
 25,
 17110,
 1081,
 14020,
 17111,
 17111,
 10178,
 14020,
 10483,
 6]

In [29]:
# Round trip: encode, then decode. Unknown words come back as "<|unk|>", so the
# original text is not fully recovered.
tokenizer.decode(tokenizer.encode(text))

'Hello, do you like tea? <|endoftext|> In the <|unk|> <|unk|> of the palace.'