# This is the notebook for chapter 2 - Tokenization part

In [1]:
# Print a fixed greeting (presumably a quick check that the kernel executes cells).
print("hello world")

hello world


### Preprocess the raw text into individual words (including punctuation)
After preprocessing, the whole raw text file becomes a list of individual tokens.

In [2]:
# Prerequisite: the-verdict.txt must be available at this relative path.

# Read the whole file into a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Show the total number of characters, then a preview of the first 99 characters.
print(len(raw_text), raw_text[:99], sep="\n")

20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
# Demonstration: building a regular-expression tokenizer step by step.
import re
sample_text = "Hello, word. This, is a test."

# Step 1: split on whitespace only. The capture group keeps each separator in
# the result. Not ideal: punctuation stays attached to the neighbouring word.
result = re.compile(r'(\s)').split(sample_text)
print(result)

# Step 2: split on whitespace, commas, and periods. Still not ideal: the result
# contains empty strings and whitespace-only elements.
result = re.compile(r'([,.]|\s)').split(sample_text)
print(result)
# Drop every element that is empty or consists only of whitespace.
result = list(filter(str.strip, result))
print(result)

# Step 3: the short story also contains tokens such as "--", so extend the
# pattern with more punctuation characters and the double dash.
sample_text = "Hello, world. Is this-- a test?"
result = re.compile(r'([,.:;?_!"()\']|--|\s)').split(sample_text)
result = list(filter(str.strip, result))
print(result)

['Hello,', ' ', 'word.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']
['Hello', ',', '', ' ', 'word', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'word', '.', 'This', ',', 'is', 'a', 'test', '.']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [4]:
# Tokenize the full story with the same pattern, discarding elements that are
# empty or whitespace-only.
preprocessed = [
    piece
    for piece in re.compile(r'([,.:;?_!"()\']|--|\s)').split(raw_text)
    if piece.strip()
]
print(len(preprocessed))

# Preview the first 30 tokens to check that the split looks reasonable.
print(preprocessed[:30])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


### Convert tokens into token IDs
The tokens are still strings, so we now need to map them to integers that can be further processed by Python.

Steps to convert tokens into token IDs:
1. First, collect the set of unique words from the tokens.
2. Sort the set alphabetically and label its entries from 0 to N-1 (where N is the number of unique tokens).
3. Use these labels to map each word to its integer token ID.


In [7]:
# Collect every distinct token and put them in sorted order.
all_unique_words = list(set(preprocessed))
all_unique_words.sort()
vocab_size = len(all_unique_words)
print("total unique tokens:", vocab_size)

# Map each token to its position in the sorted list; that position is its token ID.
vocab = dict(zip(all_unique_words, range(vocab_size)))

# Inspect the first 51 (token, ID) pairs, i.e. positions 0 through 50.
for position, entry in enumerate(vocab.items()):
    if position > 50:
        break
    print(entry)

total unique tokens: 1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
