# 2.2
I am using "The Pit and the Pendulum" by Edgar Allan Poe (which is in the public domain) as the text for preprocessing, so that the results differ from those obtained with "The Verdict" by Edith Wharton.

In [111]:
# Read the whole story into memory; later cells tokenize `raw_text`.
with open("pit-and-pendulum.txt", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Print the length and the first 99 characters as a quick sanity check.
char_count = len(raw_text)
print(f"total number of character: {char_count}")
print(raw_text[:99])

total number of character: 33679
I was sick—sick unto death with that long agony; and when they at length unbound me, and I was perm


In [112]:
# Split on whitespace only. The capturing group makes re.split keep each
# whitespace separator as its own list element instead of discarding it.
import re

text = "Welcome, everyone. My name, is Cheng Guo."
whitespace_pattern = r'(\s)'
result = re.split(whitespace_pattern, text)
print(result)

['Welcome,', ' ', 'everyone.', ' ', 'My', ' ', 'name,', ' ', 'is', ' ', 'Cheng', ' ', 'Guo.']


In [113]:
# Also split on commas and periods so punctuation becomes a separate element.
# Two separators in a row leave an empty string between them in the result.
comma_period_or_space = r'([,.]|\s)'
result = re.split(comma_period_or_space, text)
print(result)

['Welcome', ',', '', ' ', 'everyone', '.', '', ' ', 'My', ' ', 'name', ',', '', ' ', 'is', ' ', 'Cheng', ' ', 'Guo', '.', '']


In [114]:
# Keep only the entries that still contain something after stripping
# whitespace; the kept entries themselves are not modified.
result = list(filter(str.strip, result))
print(result)

['Welcome', ',', 'everyone', '.', 'My', 'name', ',', 'is', 'Cheng', 'Guo', '.']


In [115]:
# Widen the separator set: more punctuation marks plus both dash forms
# ("--" and the em dash). Each separator is kept as its own token.
text = "Wow, everyone! Am I Cheng--Guo?"
result = []
for piece in re.split(r'([,.:;?_!"()\']|--|—|\s)', text):
    stripped = piece.strip()
    if stripped:  # drop empty and whitespace-only pieces
        result.append(stripped)
print(result)

['Wow', ',', 'everyone', '!', 'Am', 'I', 'Cheng', '--', 'Guo', '?']


In [116]:
# Apply the same splitting rule to the entire short story.
story_pieces = re.split(r'([,.:;?_!"()\']|--|—|\s)', raw_text)
preprocessed = [piece.strip() for piece in story_pieces if piece.strip()]

# Total token count, followed by the first 30 tokens.
print(len(preprocessed))
print(preprocessed[:30])

7061
['I', 'was', 'sick', '—', 'sick', 'unto', 'death', 'with', 'that', 'long', 'agony', ';', 'and', 'when', 'they', 'at', 'length', 'unbound', 'me', ',', 'and', 'I', 'was', 'permitted', 'to', 'sit', ',', 'I', 'felt', 'that']


# 2.3

In [117]:
# Deduplicate the story tokens, then sort them so IDs assigned later are
# stable from run to run.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1680


In [118]:
# Build the vocabulary: each token's ID is its index in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 51 entries (IDs 0 through 50).
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('.', 6)
(':', 7)
(';', 8)
('?', 9)
('A', 10)
('After', 11)
('Agitation', 12)
('All', 13)
('Amid', 14)
('An', 15)
('And', 16)
('Another', 17)
('Arousing', 18)
('As', 19)
('At', 20)
('Avoiding', 21)
('But', 22)
('By', 23)
('Could', 24)
('Days', 25)
('Death', 26)
('Demon', 27)
('Down', 28)
('Dreading', 29)
('During', 30)
('Else', 31)
('Even', 32)
('Fate', 33)
('Fool', 34)
('For', 35)
('Forth', 36)
('Free', 37)
('French', 38)
('From', 39)
('General', 40)
('Groping', 41)
('Had', 42)
('Hades', 43)
('Having', 44)
('He', 45)
('How', 46)
('I', 47)
('In', 48)
('Inch', 49)
('Inquisition', 50)


In [119]:
# Implementing a simple text tokenizer
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is cut at whitespace, at the punctuation marks , . : ; ? _ ! " ( ) '
    and at the two dash forms "--" and "—". Each remaining piece is looked up
    in the vocabulary; there is no fallback for pieces the vocabulary lacks.
    """

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the ID -> token mapping.

        Args:
            vocab: dict mapping each token string to its integer ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {}
        for token, token_id in vocab.items():
            self.int_to_str[token_id] = token

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            list of integer IDs, one per token, in order of appearance.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|—|\s)', text):
            token = piece.strip()
            if token:  # skip the empty / whitespace-only pieces left by the split
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        each punctuation mark or dash is removed. Whitespace after a mark is
        left alone, so a dash is followed by a space in the result.

        Args:
            ids: iterable of integer token IDs.

        Returns:
            the reconstructed text.

        Raises:
            KeyError: if an ID has no entry in the inverse vocabulary.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.:;?_!"()\']|--|—|\s)', r'\1', spaced)

In [120]:
# Build a V1 tokenizer from the story vocabulary and encode a multi-line passage.
text = """I saw that some ten or twelve vibrations would bring the steel 
            in actual contact with my robe—and with this observation there suddenly 
            came over my spirit all the keen, collected calmness of despair."""
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text)
print(ids)

[47, 1259, 1464, 1346, 1455, 1043, 1542, 1600, 1668, 269, 1465, 1377, 803, 118, 366, 1656, 979, 1237, 1679, 151, 1656, 1479, 1017, 1470, 1413, 290, 1054, 979, 1361, 136, 1465, 848, 5, 342, 289, 1030, 439, 6]


In [121]:
# Map the IDs back to text to see how closely decode() reproduces the passage.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

I saw that some ten or twelve vibrations would bring the steel in actual contact with my robe— and with this observation there suddenly came over my spirit all the keen, collected calmness of despair.


In [122]:
# New text (modified code)
# Demonstrates the V1 limitation: encode() has no ID for a word missing from
# the vocabulary and raises KeyError. Only KeyError is caught, because the
# message below describes a missing word; any other failure should surface
# as a normal traceback instead of being reported as a vocabulary miss.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as e:
    print(f"This error is a {type(e).__name__} on the word {e}")

This error is a KeyError on the word 'Hello'


# 2.4

In [123]:
# Append two special tokens after the sorted story tokens so they receive
# the two highest IDs, then rebuild the vocabulary from the combined list.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}
print(len(vocab))

1682


In [124]:
# Show the last five vocabulary entries to confirm the special tokens are at the end.
last_five = list(vocab.items())[-5:]
for entry in last_five:
    print(entry)

('yawning', 1677)
('yet', 1678)
('—', 1679)
('<|endoftext|>', 1680)
('<|unk|>', 1681)


In [125]:
# New text tokenizer
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Splitting works on whitespace, the punctuation marks , . : ; ? _ ! " ( ) '
    and the dash forms "--" and "—". Any piece missing from the vocabulary is
    replaced by the "<|unk|>" token before the ID lookup.
    """

    def __init__(self, vocab):
        """Store the token -> ID mapping and build its inverse.

        Args:
            vocab: dict mapping each token string to its integer ID.
        """
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            list of integer IDs; unknown tokens get the ID of "<|unk|>".

        Raises:
            KeyError: if an unknown token is met and "<|unk|>" itself is not
                in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|—|\s)', text):
            token = piece.strip()
            if not token:
                continue  # empty or whitespace-only piece left by the split
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        each punctuation mark or dash is removed.

        Args:
            ids: iterable of integer token IDs.

        Returns:
            the reconstructed text.

        Raises:
            KeyError: if an ID has no entry in the inverse vocabulary.
        """
        tokens = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.:;?_!"()\']|--|—|\s)', r'\1', spaced)

In [126]:
# Build a sample made of two unrelated sentences separated by the
# end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "This is a story about pit and pendulum."
separator = " <|endoftext|> "
text = separator.join([text1, text2])
print(text)

Hello, do you like tea? <|endoftext|> This is a story about pit and pendulum.


In [127]:
# Encode the sample with the V2 tokenizer built on the extended vocabulary.
tokenizer = SimpleTokenizerV2(vocab)
sample_ids = tokenizer.encode(text)
print(sample_ids)

[1681, 5, 474, 1681, 881, 1681, 9, 1680, 85, 837, 103, 1681, 104, 1099, 151, 1077, 6]


In [128]:
# Round trip: encode the sample, then decode the IDs back to text.
round_trip_ids = tokenizer.encode(text)
print(tokenizer.decode(round_trip_ids))

<|unk|>, do <|unk|> like <|unk|>? <|endoftext|> This is a <|unk|> about pit and pendulum.
