In [1]:
# 1. Setup: Importing libraries
import re
import nltk
from nltk.tokenize import WhitespaceTokenizer, word_tokenize

# Fetch the Punkt tokenizer data used by NLTK's word_tokenize.
# Both the 'punkt' and 'punkt_tab' resources are requested
# (presumably so the cell works across NLTK releases -- TODO confirm which one
# the installed version actually loads).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# 2. Sample text
# Two short sentences containing a contraction ("Don't") and sentence-final
# punctuation, so the differences between the tokenizers below are visible.
sample_text = "AI is transforming Data Analysis. Don't forget your Oracle exam!"

# Banner: rule, heading, the text itself, rule -- one item per output line.
divider = "-" * 30
print(divider, "ORIGINAL TEXT:", sample_text, divider, sep="\n")

# --- METHOD A: plain Python str.split() ---
# Called with no arguments, split() breaks on runs of whitespace, so
# punctuation stays glued to the neighbouring word ('Analysis.', 'exam!').
method_a = sample_text.split()
print("\n1. Python .split() (Basic Whitespace):", method_a, sep="\n")

# --- METHOD B: NLTK WhitespaceTokenizer ---
# NLTK's whitespace-only tokenizer (spaces, tabs, newlines). On this sample it
# yields the same tokens as str.split(), punctuation still attached.
ws_tokenizer = WhitespaceTokenizer()
method_b = ws_tokenizer.tokenize(sample_text)
print("\n2. NLTK WhitespaceTokenizer:", method_b, sep="\n")

# --- METHOD C: NLTK word_tokenize (the "smarter" way) ---
# Splits punctuation into separate tokens and breaks contractions apart
# ("Don't" -> 'Do', "n't"), which is why it is a common default for ML work.
method_c = word_tokenize(sample_text)
print("\n3. NLTK Word Tokenizer (Handles punctuation):", method_c, sep="\n")

# Takeaway: compare how the three methods treat punctuation.
# (Subword tokenization -- the scheme used by LLMs -- is not demonstrated in
# this cell; nothing here performs it.)
print(
    "\nNote: Notice how in methods 1 & 2, 'Analysis.' includes the period.",
    "In method 3, the period is its own token. This is better for ML models!",
    sep="\n",
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


------------------------------
ORIGINAL TEXT:
AI is transforming Data Analysis. Don't forget your Oracle exam!
------------------------------

1. Python .split() (Basic Whitespace):
['AI', 'is', 'transforming', 'Data', 'Analysis.', "Don't", 'forget', 'your', 'Oracle', 'exam!']

2. NLTK WhitespaceTokenizer:
['AI', 'is', 'transforming', 'Data', 'Analysis.', "Don't", 'forget', 'your', 'Oracle', 'exam!']

3. NLTK Word Tokenizer (Handles punctuation):
['AI', 'is', 'transforming', 'Data', 'Analysis', '.', 'Do', "n't", 'forget', 'your', 'Oracle', 'exam', '!']

Note: Notice how in methods 1 & 2, 'Analysis.' includes the period.
In method 3, the period is its own token. This is better for ML models!


In [3]:

import pandas as pd

tokens = tokenizer.tokenize(sentences[0])
ids = tokenizer.convert_tokens_to_ids(tokens)

df = pd.DataFrame({"Token": tokens, "ID": ids})
print(df)# Step 1: Install the Transformers library
!pip install transformers -q

import torch
from transformers import AutoTokenizer

# Step 2: Initialize the Tokenizer
# We'll use 'bert-base-uncased' as it's the standard for learning tokenization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Define your input text
raw_text = "Hugging Face is a giant in the AI community. Their tokenizers are incredibly fast!"

# --- SENTENCE-LEVEL HANDLING ---
# In modern NLP, we usually split text into a list of strings
# For high-precision sentence splitting, libraries like 'pysbd' or 'nltk' are used.
# Here is a clean way to handle multiple sentences:
sentences = [s.strip() + "." for s in raw_text.split(".") if s]

print(f"--- Step 1: Sentence Tokenization ---")
for i, s in enumerate(sentences):
    print(f"Sentence {i+1}: {s}")

# --- WORD/SUBWORD TOKENIZATION ---
print(f"\n--- Step 2: Subword Tokenization & Encoding ---")

# We can process all sentences at once (Batch Processing)
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

for i, sentence in enumerate(sentences):
    # Convert text to human-readable subword tokens
    tokens = tokenizer.tokenize(sentence)

    # Convert text to numerical IDs (what the model actually sees)
    input_ids = tokenizer.encode(sentence)

    print(f"\nResults for Sentence {i+1}:")
    print(f"Tokens:  {tokens}")
    print(f"IDs:     {input_ids}")

# --- DECODING ---
print(f"\n--- Step 3: Decoding ---")
decoded = tokenizer.decode(inputs['input_ids'][0])
print(f"Decoded first sentence: {decoded}")

       Token     ID
0    hugging  17662
1       face   2227
2         is   2003
3          a   1037
4      giant   5016
5         in   1999
6        the   1996
7         ai   9932
8  community   2451
9          .   1012
--- Step 1: Sentence Tokenization ---
Sentence 1: Hugging Face is a giant in the AI community.
Sentence 2: Their tokenizers are incredibly fast!.

--- Step 2: Subword Tokenization & Encoding ---

Results for Sentence 1:
Tokens:  ['hugging', 'face', 'is', 'a', 'giant', 'in', 'the', 'ai', 'community', '.']
IDs:     [101, 17662, 2227, 2003, 1037, 5016, 1999, 1996, 9932, 2451, 1012, 102]

Results for Sentence 2:
Tokens:  ['their', 'token', '##izer', '##s', 'are', 'incredibly', 'fast', '!', '.']
IDs:     [101, 2037, 19204, 17629, 2015, 2024, 11757, 3435, 999, 1012, 102]

--- Step 3: Decoding ---
Decoded first sentence: [CLS] hugging face is a giant in the ai community. [SEP]
