# Using a Short Story as a Text Sample in Python

**Step 1 : Create Tokens**

In [12]:
# Read the whole short story into memory as a single string.
# NOTE(review): absolute path (looks like a Colab location) - adjust when running elsewhere.
with open("/content/sample_data/the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the text length, then preview its first 99 characters
print('Number of characters : ', len(raw_text))
print(raw_text[:99])

Number of characters :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


- Tokenize the whole text (20479 characters) into individual words and special characters, then turn the tokens into embeddings

- Split the text into a list of tokens based on whitespace or special characters ...

In [13]:
import re # Regular expression

text_test = "Hello, world. this, is a test!"
# The capturing group makes the split keep the delimiters
# (punctuation marks and single whitespace characters) in the result.
splitter = re.compile(r'([,.!]|\s)')
result = splitter.split(text_test)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'this', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '!', '']


- Remove the redundant empty and whitespace-only strings safely

In [14]:
# Keep only the pieces that are non-empty after stripping. Testing with
# str.strip (rather than comparing against '' and ' ') also discards
# pieces made of other whitespace such as tabs or newlines.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'this', ',', 'is', 'a', 'test', '!']


- In our context we remove the whitespace because our text structure doesn't need it (we are working on a simple text sample)

- Full process

In [15]:
text_test = 'I HAD always! thought Jack Gisburn rather? a cheap genius--though a good fellow enough--so it was no! '
# Tokenization scheme: split on punctuation, double dashes and whitespace
# (keeping the delimiters), then discard empty / whitespace-only pieces.
token_pattern = re.compile(r'([.,:;!_?"]|--|\s)')
result = [piece for piece in token_pattern.split(text_test) if piece.strip()]
print(result)

['I', 'HAD', 'always', '!', 'thought', 'Jack', 'Gisburn', 'rather', '?', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', '!']


- Apply the tokenizer on the Story

In [16]:
# Tokenize the full story with the same scheme, now also splitting on
# parentheses and single quotes, and drop empty / whitespace-only pieces.
story_pattern = re.compile(r'([,.:;_!?"()\']|--|\s)')
preprocessed = [piece for piece in story_pattern.split(raw_text) if piece.strip()]
print(preprocessed[:20])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


In [17]:
# Size of the tokenized story
token_count = len(preprocessed)
print('Total number of tokens : ', token_count)

Total number of tokens :  4690


**Step 2 : Create Tokens IDs**

- Create a list of the unique tokens and sort them alphabetically to determine the vocabulary size

In [18]:
# Deduplicate the tokens, then sort them (string order, so punctuation
# and capitalized words come before lowercase words).
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
print(all_words[:15])
# The number of unique tokens is the vocabulary size
print(len(all_words))

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And']
1130


- Create the Vocabulary itself

In [19]:
# Map every unique token to an integer id: its position in the sorted list
vocab = dict(zip(all_words, range(len(all_words))))


In [20]:
# Preview the first 26 (token, id) pairs of the vocabulary
for entry in list(vocab.items())[:26]:
  print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)


- Implement the Tokenizer class

In [21]:
class SimpleTokenizer:
  """Word-level tokenizer backed by the notebook-level `vocab` mapping.

  `encoder` converts text into a list of integer ids and `decoder`
  converts ids back into text. A token that is missing from the
  vocabulary raises `KeyError` in `encoder`.
  """

  def __init__(self):
    """Capture the module-level `vocab` and build the reverse lookup."""
    # `vocab` is read from the notebook namespace at instantiation time
    self.str_to_int = vocab
    self.int_to_str = {token_id: token for token, token_id in vocab.items()}

  def encoder(self, txt):
    """Split `txt` into tokens and return the list of their ids.

    The text is split on punctuation, double dashes and whitespace; empty
    and whitespace-only pieces are discarded.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    processed = re.split(r'([.,;?()!_:\'"]|--|\s)', txt)

    processed = [item.strip() for item in processed if item.strip()]

    return [self.str_to_int[t] for t in processed]

  def decoder(self, ids):
    """Join the tokens for `ids` with spaces and tidy up the punctuation.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    txt = " ".join(self.int_to_str[token_id] for token_id in ids)
    # Remove the whitespace before punctuation marks. ';' and ':' are
    # included because the encoder splits on them as well.
    txt = re.sub(r'\s+([,.;:"\'?!()])', r'\1', txt)
    # Remove the whitespace between a quote and the word that follows it.
    # NOTE(review): this is a heuristic - it also glues a closing quote to
    # the next word.
    txt = re.sub(r'(["\'])\s+(\w+)', r'\1\2', txt)
    return txt

# Try the tokenizer: encode one sample, then round-trip a second sample
token = SimpleTokenizer()
sample_ids = token.encoder('Ah! At "Among" !')
print(sample_ids)
round_trip_ids = token.encoder('Ah! \'At\':"Among " !')
print(token.decoder(round_trip_ids))


[12, 0, 18, 1, 13, 1, 0]
Ah!'At' :"Among"!


- To address words not in the vocabulary, the vocabulary needs to be extended.

**Adding Special Context Tokens**

- Modify the Tokenizer to handle unknown words (<|unk|>, <|endoftext|>)

In [26]:
# Rebuild the sorted token list and append the two special tokens at the end
special_tokens = ['<|unk|>', '<|endoftext|>']
all_tokens = sorted(set(preprocessed)) + special_tokens

# Rebinds `vocab`: token -> id, with the special tokens taking the last ids
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}
print(len(vocab))

1132


In [27]:
# Show the last five vocabulary entries, which include the special tokens
last_entries = list(vocab.items())[-5:]
for entry in last_entries:
  print(entry)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)
('<|endoftext|>', 1131)


- Implementing SimpleTokenizerV2

In [43]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to '<|unk|>'."""

  def __init__(self, vocab):
    """Store `vocab` (token -> id) and build the reverse (id -> token) lookup."""
    self.str_to_int = vocab
    self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

  def encoder(self, txt):
    """Return the ids of the tokens in `txt`.

    The text is split on punctuation, double dashes and whitespace. Empty
    pieces are skipped, and any token that is not a key of the vocabulary
    is replaced by '<|unk|>' before the id lookup.
    """
    ids = []
    for piece in re.split(r'([.,;?()!_:\'"]|--|\s)', txt):
      token = piece.strip()
      if not token:
        # Empty or whitespace-only piece produced by the split
        continue
      if token not in self.str_to_int:
        token = '<|unk|>'
      ids.append(self.str_to_int[token])
    return ids

  def decoder(self, ids):
    """Join the tokens for `ids` with spaces, then drop the space before punctuation."""
    tokens = [self.int_to_str[i] for i in ids]
    return re.sub(r'\s+([,.;:!?()"\'])', r'\1', " ".join(tokens))

In [44]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = 'Salam! I love tea so much. What about you?'
text2 = 'Yes! i love tea and coffee.'
# Place the '<|endoftext|>' token between the two samples
separator = " <|endoftext|> "
text = separator.join([text1, text2])

print(text)

Salam! I love tea so much. What about you? <|endoftext|> Yes! i love tea and coffee.


In [45]:
# Encode once, then decode the same ids to inspect the round trip
encoded = tokenizer.encoder(text)
print(encoded)
print(tokenizer.decoder(encoded))

[1130, 0, 53, 1130, 975, 908, 691, 7, 109, 118, 1126, 10, 1131, 112, 0, 1130, 1130, 975, 157, 1130, 7]
<|unk|>! I <|unk|> tea so much. What about you? <|endoftext|> Yes! <|unk|> <|unk|> tea and <|unk|>.


- More Special tokens : [BOS] [EOS] [PAD]

- GPT models don't use <|unk|> tokens. Instead, they use a tokenizer called **Byte Pair Encoding**, which breaks words into subword units.

**Byte Pair Encoding**

- Use the Python open-source library '**tiktoken**' 'https://github.com/openai/tiktoken'

In [4]:
# %pip install tiktoken==0.12.0  # uncomment to install into the kernel's environment

In [3]:
# Import the tiktoken package and check its version.
# `importlib.metadata` is a submodule: it must be imported explicitly,
# because a bare `import importlib` does not guarantee it is loaded.
import importlib.metadata
import tiktoken

print('Tiktoken version : ', importlib.metadata.version("tiktoken"))

Tiktoken version :  0.12.0


In [None]:
# Instantiate BPE tokenizer

In [7]:
# Load tiktoken's "gpt2" encoding. This rebinds `tokenizer`, which
# previously held the SimpleTokenizerV2 instance.
tokenizer = tiktoken.get_encoding("gpt2")

In [13]:
# NOTE(review): the two literals are concatenated without a space, so the
# text contains "morning.what" - confirm this is intended.
text = (
    "Hello! do you like tea? <|endoftext|> The sun rose quickly this morning."
    "what do you thinkofme me?"
)

# '<|endoftext|>' must be explicitly allowed to be encoded as a special token
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)
# Decodes to "me me" (two tokens), not a single "me"
print(tokenizer.decode([1326, 502]))

[15496, 0, 466, 345, 588, 8887, 30, 220, 50256, 383, 4252, 8278, 2952, 428, 3329, 13, 10919, 466, 345, 892, 1659, 1326, 502, 30]
me me


In [14]:
# Round trip: turn the BPE ids back into a string (rebinds `text`)
text = tokenizer.decode(ids)
print(text)

Hello! do you like tea? <|endoftext|> The sun rose quickly this morning.what do you thinkofme me?
