#### Get 'The Verdict' Text.

In [None]:
from typing import List

# Read the whole short story into memory as a single string.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open("data/the-verdict.txt", "r", encoding="utf-8") as the_verdict_file:
  the_verdict_raw_text = the_verdict_file.read() # Read in text from 'The Verdict'

print(the_verdict_raw_text)

In [2]:
# Length of the raw text in characters (len() of a str counts characters, not bytes)
len(the_verdict_raw_text)

20479

In [2]:
# Import the standard-library regular expression module (used to split the text into tokens)
import re
print("Imported regular expressions!")

Imported regular expressions!


#### Tokenize 'The Verdict', removing white space and treating individual words and punctuation marks as tokens.

In [3]:
# Split the raw text on punctuation marks, double dashes and whitespace.
# The regular expression (supplied by the video) wraps the delimiters in a capturing
# group, so re.split keeps them in its result: punctuation survives as separate tokens.
the_verdict_tokens = re.split(r'([,.:;?_!"()\']|--|\s)', the_verdict_raw_text)
print(the_verdict_tokens)

# Drop the pieces that are empty or whitespace-only; words and punctuation remain.
the_verdict_tokens = [piece for piece in the_verdict_tokens if piece.strip()]
print(the_verdict_tokens)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', '', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', '', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', '', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', '', ' ', 'and', ' ', 'established', ' ', 'himself', ' ', 'in', ' ', 'a', ' ', 'villa', ' ', 'on', ' ', 'the', ' ', 'Riviera', '.', '', ' ', '', '(', 'Though', ' ', 'I', ' ', 'rather', ' ', 'thought', ' ', 'it', ' ', 'would', ' ', 'have', ' ', 'been', ' ', 'Rome', ' ', 'or', ' ', 'Florence', '.', '', ')', '', '\n', '', '\n', '', '"', 'The', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', '"', '', '--', 'that', ' ', 'was', ' ', '

#### Create a Vocabulary out of the words in 'The Verdict'
#### Assign them each a Unique Token Id

In [4]:
# Build the vocabulary: the distinct tokens (words and punctuation marks),
# sorted in Python's default string order.
all_words = sorted(set(the_verdict_tokens))
vocabulary_size = len(all_words)

# Number the sorted vocabulary 0, 1, 2, ... so that every token gets its own unique token id
token_to_id_mapping = dict(zip(all_words, range(vocabulary_size)))

#### Create your own version of the Version One Tokenizer
To Note:
* It can take in a sample text in \_\_init\_\_(...)
* It can then create a mapping dictionary from tokens to ids using the \_\_init\_\_(...) function
* For the function encode_text(...), it can use the token to ids mapping dictionary to take in a string containing a sentence, and then output a list of integer ids
* For the function decode_ids(...), it can use the inverse mapping dictionary (token ids to tokens) to take in a list of ids encoding a sentence, and then output the corresponding list of tokens. You can also specify a parameter to output the sentence as a string.

In [5]:
class TokenizerV1:
  """
  Version 1 of a class used to tokenize texts.

  Tokenization is the act of breaking down a text into significant individual units called 'tokens'.
  Here a token is either a word or a punctuation mark; whitespace is discarded.

  The vocabulary is built once, from the source text given to the constructor. Tokens that are
  not in that vocabulary are encoded as the id -1, and ids that are not in the vocabulary
  (including -1) are decoded as the placeholder token "<unknown>".
  """
  # Splits on punctuation, double dashes and whitespace. The capturing group makes the split
  # keep the delimiters in its result, so punctuation survives as tokens of its own.
  # Compiled once here so the constructor and encode_text share exactly the same rule.
  _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')

  # Real token ids start at 0, so -1 can never collide with a vocabulary entry.
  _UNKNOWN_TOKEN_ID = -1
  _UNKNOWN_TOKEN = "<unknown>"

  def __init__(self, source_text: str):
    """
    Args:
      source_text: str - Contains the source text we are going to use to create our vocabulary

    Creates:
      self.token_to_token_id_mapping: Dict[str, int] - Maps from token to token id
      self.token_id_to_token_mapping: Dict[int, str] - Maps from token id to token
    """
    # Sort the unique tokens so that the id assignment is deterministic for a given text.
    sorted_unique_tokens = sorted(set(self._split_into_tokens(source_text)))

    self.token_to_token_id_mapping = {token: token_id for token_id, token in enumerate(sorted_unique_tokens)}
    self.token_id_to_token_mapping = {token_id: token for token, token_id in self.token_to_token_id_mapping.items()}

  @classmethod
  def _split_into_tokens(cls, text: str) -> List[str]:
    """
    Splits a text into word and punctuation tokens.

    Args:
      text: str - The text to split

    Returns:
      List[str] - The tokens in order of appearance, without empty or whitespace-only pieces
    """
    return [piece for piece in cls._SPLIT_PATTERN.split(text) if piece.strip()]

  def encode_text(self, text: str) -> List[int]:
    """
    Tokenizes a text with the same rule used to build the vocabulary and returns its token ids.

    Args:
      text: str - The text to encode

    Returns:
      List[int] - One token id per token of the text; -1 for tokens not in the vocabulary
    """
    return self.encode_tokens(self._split_into_tokens(text))

  def encode_tokens(self, list_of_tokens: List[str]) -> List[int]:
    """
    Returns a new list of token ids that correspond to the input list of tokens.

    Args:
      list_of_tokens: List[str] - List of tokens

    Returns:
      list_of_token_ids: List[int] - The corresponding list of token ids to the list of tokens.
        Tokens that are not in the vocabulary are encoded as -1.
    """
    return [
      self.token_to_token_id_mapping.get(token, self._UNKNOWN_TOKEN_ID)
      for token in list_of_tokens
    ]

  def decode_token_ids(self, list_of_token_ids: List[int]) -> List[str]:
    """
    Returns a new list of tokens that correspond to the input list of token ids.

    Args:
      list_of_token_ids: List[int] - List of token ids

    Returns:
      list_of_tokens: List[str] - The corresponding list of tokens to the list of token ids.
        Ids that are not in the vocabulary are decoded as "<unknown>".
    """
    return [
      self.token_id_to_token_mapping.get(token_id, self._UNKNOWN_TOKEN)
      for token_id in list_of_token_ids
    ]
    

In [8]:
# Client code for the TokenizerV1 class:
# encode the first 17 tokens of 'The Verdict' and decode them back again.

tokenizer = TokenizerV1(the_verdict_raw_text)

# Tokens -> token ids
origin_text_tokenized = the_verdict_tokens[:17]
print(origin_text_tokenized)
token_ids = tokenizer.encode_tokens(origin_text_tokenized)
print(token_ids)

# Token ids -> tokens
token_ids_converted_to_tokens = tokenizer.decode_token_ids(token_ids)
print(token_ids_converted_to_tokens)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--']
[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6]
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--']


In [None]:
# Exercise: Implement the byte pair encoding (BPE) algorithm. This is covered later in the video.
# Maybe let's not do this? It's not hard to grasp, but a decent amount of work.
# Let's stick to more LLM-centered stuff.

#### Use Byte Pair Encoding Through A Third Party Library
##### Through Byte Pair Encoding, if you're given an unknown word, the algorithm will be able to find and tokenize known words that may occur within the larger unknown word. All remaining unknown pieces will be broken down and tokenized into their individual characters.

For example, 'asfartestdfsdjumpsiwe' will get tokenized as ['as', 'far', 'test', 'd', 'f', 's', 'd', 'jump', 's', 'i', 'w', 'e']

In [22]:
import tiktoken

gpt2_tokenizer = tiktoken.get_encoding("gpt2") # gets the gpt2 tokenizer
# Round trip a sample word: text -> token ids -> text
encoded = gpt2_tokenizer.encode("parrot")
print(encoded)
print(gpt2_tokenizer.decode(encoded))

[1845, 10599]
parrot


0.9.0
