<a href="https://colab.research.google.com/github/DataSavvyYT/AI-engineering-course/blob/main/01_llm_tokens/03_llm_tokens_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Install necessary library ---
# tiktoken is the official BPE tokenizer for OpenAI models (GPT-3.5, GPT-4, etc.)
!pip install tiktoken -q

In [None]:
# --- 2. Import Libraries ---
import tiktoken

In [None]:
import tiktoken

def visualize_tokens(text, model_encoding="cl100k_base"):
    """
    Tokenize ``text`` with tiktoken and print a human-readable report.

    The report contains the token count, the list of token strings, and one
    numbered line per token so that boundaries (including leading spaces)
    are easy to see. Token strings are shown instead of integer IDs.

    Args:
        text: The string to tokenize.
        model_encoding: Name of the tiktoken encoding to load. Defaults to
            "cl100k_base".

    Returns:
        None. The report is written to standard output.
    """
    # Load the requested encoding (the default is "cl100k_base").
    encoding = tiktoken.get_encoding(model_encoding)

    # Tiktoken requires us to encode to integers first to find boundaries
    token_integers = encoding.encode(text)

    # Convert each integer back to its own bytes and decode them for display.
    # A single token can hold only part of a multi-byte UTF-8 character, so
    # its bytes are not always valid UTF-8 on their own. "backslashreplace"
    # renders such bytes as \xNN escapes; the previous "replace" handler
    # collapsed them all into U+FFFD, making different tokens look identical.
    token_strings = [
        encoding.decode_single_token_bytes(token).decode(
            "utf-8", errors="backslashreplace"
        )
        for token in token_integers
    ]

    print(f"--- Tokenization Report for: '{model_encoding}' ---")
    print(f"Original Text:  {text}\n")
    print(f"Token Count:    {len(token_strings)}")
    print(f"Token List:     {token_strings}")
    print("-" * 50)

    # Detailed view: one token per line, numbered from 1, quoted so that
    # leading/trailing whitespace inside a token stays visible.
    print("\nDetailed Boundary View:")
    for position, token in enumerate(token_strings, start=1):
        print(f"Token {position}: '{token}'")

# --- Example Usage ---
# Run the report on a short sentence, naming the default encoding explicitly.
sample_text = "Tokenization is fascinating!"
visualize_tokens(sample_text, model_encoding="cl100k_base")

--- Tokenization Report for: 'cl100k_base' ---
Original Text:  Tokenization is fascinating!

Token Count:    5
Token List:     ['Token', 'ization', ' is', ' fascinating', '!']
--------------------------------------------------

Detailed Boundary View:
Token 1: 'Token'
Token 2: 'ization'
Token 3: ' is'
Token 4: ' fascinating'
Token 5: '!'


In [None]:
# --- 4. Define Input Text Examples ---
# Example 1: Standard English sentence
text_1 = "The tokenization demonstration with tiktoken is now complete."

# Example 2: Numbers, symbols, and a long, less common word
# ('unpredictability'). Note that it is the tokenizer, not the LLM itself,
# that splits text: a BPE tokenizer breaks input like this into smaller
# subword pieces.
text_2 = "Hello $1,000,000! Let's analyze the 'unpredictability' of the process."