In [1]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import PyPDF2

pdf_path = "sustainable_development.pdf"

# Open the PDF file in binary mode
with open(pdf_path, "rb") as pdf_file:  
    reader = PyPDF2.PdfReader(pdf_file)
    
    # Extract text from all pages
    raw_text = ""
    for page in reader.pages:
        raw_text += page.extract_text()
    
    print("Total number of characters:", len(raw_text))
    print(raw_text[:99])  # Print the first 99 characters




Total number of characters: 9587
Global Rewilding Initiative: Rewilding People & Nature
Learn how one person, even of limited means,


Our goal is to tokenize 9587 characters into individual words and special characters that we can then turn into embeddings for LLM training

#### An Example of Text Splitting ####

In [3]:
import re # This library(regular expression) helps split the text to help obtain a list of tokens

text = "Hello, world. This is a test."
result = re.split(r'(\s)', text) # (\s) splits when whitespaces are encountered

print(result) 

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']
This is the results printed out, which is a list of individual words,whitespaces and punctuation characters

Let's modify the regular expression splits on whitespaces(\s), commas, and periods([,.])

In [4]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


#### Removing the whitespaces ###  This reduces memory and computation requirements. However keeping whitespaces can be useful if we train models that are sensitive to the exact structure of the text(eg Python code which is sensitive to indentation and spacing)

In [5]:
result = [item for item in result if item.strip()]

print (result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '.']


In [6]:
# Splitting all other characters like commas that can be present in the text
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!#$"()<>\']|--|\s)', text)      #### This line and
result = [item.strip() for item in result if item.strip()]  #### This line involves our tokenization scheme
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
# Strip whitespaces from each item and then filter out any empty strings.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


##### Lets now tokenize our text pdf that we are using above "sustainable_development.pdf".

In [8]:
preprocessed = re.split(r'([,.:;?_!#$"()<>\']|--|\s)', raw_text)  # preprocessed is just a variable
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['Global', 'Rewilding', 'Initiative', ':', 'Rewilding', 'People', '&', 'Nature', 'Learn', 'how', 'one', 'person', ',', 'even', 'of', 'limited', 'means', ',', 'can', 'plant', 'a', 'forest', '.', 'Home', '›', 'Articles', '›', '17', 'Sustainable', 'Development']


In [9]:
print(len(preprocessed))

1615


###### Creating a list of all unique tokens and sorting them out alphabetically to determine the vocabulary size ######

In [10]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
 
print(vocab_size)

613


In [11]:
## Assign integer values(from zero)to all tokens(which include punctuation marks, numbers and alphabetical letters,in alphabetical order)
vocab = {token:integer for integer, token in enumerate(all_words)}

In [12]:
# printing first 51 entries for illustration purposes
for i , item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('$', 0)
('&', 1)
('(', 2)
(')', 3)
(',', 4)
('.', 5)
('//glorew', 6)
('1', 7)
('1/4GOAL', 8)
('10', 9)
('11', 10)
('11/24/24', 11)
('12', 12)
('13', 13)
('14', 14)
('15', 15)
('16', 16)
('17', 17)
('2', 18)
('2/4table', 19)
('2015', 20)
('2020', 21)
('2024', 22)
('2030', 23)
('25', 24)
('3', 25)
('3/4About', 26)
('35', 27)
('4', 28)
('4/4', 29)
('5', 30)
('500', 31)
('6', 32)
('7', 33)
('8', 34)
('9', 35)
(':', 36)
(';', 37)
('?', 38)
('A', 39)
('AM', 40)
('About', 41)
('Achieve', 42)
('Action', 43)
('Affordable', 44)
('Agenda', 45)
('Agriculture', 46)
('Always', 47)
('An', 48)
('Articles', 49)
('ArticlesArticles', 50)


##### As seen from the output above, the dictionary contains individual tokens with unique integer labels

#### Lets now implement a complete tokenizer class in python
###  This class will have both an encode and decode method
#### Encode method takes text as input and gives out token ids as output
#### Decode method takes token ids as input and gives out text as output

In [14]:
### Tokenizer class

class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()} # s token i token id

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!#$"()<>\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

#### Let's instantiate a new tokenizer object from the SimpleTokenizerV1 class and tokenize a passage  from the pdf uploaded 

In [15]:
tokenizer = SimpleTokenizerV1(vocab)

text = """Of the 17 goals, adopted by the
General Assembly of the United
Nations as Agenda for Sustainable
Development,"""
ids = tokenizer.encode(text) # Encode turns text to ids
print(ids)

[137, 556, 17, 355, 4, 226, 270, 556, 87, 51, 452, 556, 201, 132, 246, 45, 339, 189, 70, 4]


In [16]:
tokenizer.decode(ids) # Decode turns the ids into text

'Of the 17 goals, adopted by the General Assembly of the United Nations as Agenda for Sustainable Development,'

#### We implemented a tokenizer capable of tokenizing and de-tokenizing text based on a snippet from the training set.


Let's now apply it to a new text sample that is not contained in the training set

In [17]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

#### Hello was not used in our tutorial hence its un unknown word hence there is an exception...

##### Having large and diverse datasets helps cater for new words so there wont be exceptions of unknown words


 ##### SPECIAL CONTEXT TOKENS #####
This deals with the unknown words for a training model hence not to show an error message

##### We will modify the vocabulary and tokenizer we implemented in the previous section, SimpleTokenizerV2, to support 2 new tokens: <|unk|> and <|endoftext|> and they are assigned token ids respectively

##### For some new unknown words that were not in the vocaburaly, they will be assigned tokens of unknown and given token ids of unknown


##### When working with multiple text sources we add <|endoftext|> tokens between the texts signaling end and start of different segments

In [18]:
# We will add these to tokens to the list of all unique words that we created in the previous section
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [19]:
len(vocab.items())

615

#### length of vocabulary has increased from 613 to 615

#### For additional quick check, let's print last 5 entries of the updated vocabulary

In [20]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('“leaving', 610)
('…', 611)
('›', 612)
('<|endoftext|>', 613)
('<|unk|>', 614)


#### Let's extend the simple tokenizer class

In [21]:
class SimpleTokenizerV2:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([],.:;?!"()\'])', r'\1', text)
        return text

In [22]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Most parasites  in tree planting,"
text2 = "only evaluate themselves."

text = "<|endoftext|>".join((text1, text2))

print(text)

Most parasites  in tree planting,<|endoftext|>only evaluate themselves.


In [23]:
tokenizer.encode(text)

[130, 614, 383, 568, 484, 4, 614, 320, 558, 5]