# Implementing Tokenization

Tokenizers play a pivotal role in natural language processing, segmenting text into smaller units known as tokens. These tokens are subsequently transformed into numerical representations called token indices, which are directly employed by deep learning algorithms.

In [None]:
# Install Necessary Libraries and Download Models

print("--- Installing Libraries ---")

# Install NLTK (often used for basic NLP tasks)
!pip install -qqq nltk

# Install Hugging Face Transformers
# Use -qqq for quiet output to keep the log clean
!pip install -qqq transformers

# Install SentencePiece for subword tokenization (often a dependency of models)
!pip install -qqq sentencepiece

# Install spaCy for advanced NLP tasks
!pip install -qqq spacy

# Download spaCy language models
# These downloads are often prompted to require a runtime restart.
print("\n--- Downloading spaCy Models ---")
!python -m spacy download en_core_web_sm -qqq
!python -m spacy download de_core_news_sm -qqq

print("\n--- Installation and Downloads Complete ---")
print("If prompted by spaCy, please RESTART THE RUNTIME (Runtime -> Restart runtime) now.")
print("This is crucial for spaCy models to load correctly.")

--- Installing Libraries ---

--- Downloading spaCy Models ---
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Res

In [None]:
# @title 2. Import Libraries and NLTK Data (Run AFTER restarting session if prompted by Install Cell)
# Run this cell AFTER you have restarted the session if the *installation cell* prompted you to.

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
from transformers import AutoTokenizer # We'll use AutoTokenizer for transformers

# --- Explicit NLTK Data Download ---
# Ensure 'punkt_tab' is downloaded. This is the newer, required resource for NLTK tokenizers.
# The resource is looked up locally first and only downloaded when the lookup fails.
print("--- Ensuring NLTK 'punkt_tab' tokenizer is downloaded ---")
try:
    nltk.data.find('tokenizers/punkt_tab') # Check for punkt_tab specifically
    print("NLTK 'punkt_tab' tokenizer already present.")
except LookupError:
    print("Downloading NLTK 'punkt_tab' tokenizer...")
    # nltk.download() signals failure through its boolean return value instead of
    # raising, so check it before announcing success.
    if not nltk.download('punkt_tab', quiet=True):
        raise RuntimeError(
            "Could not download the NLTK 'punkt_tab' resource. "
            "Check network access and re-run this cell."
        ) from None
    print("NLTK 'punkt_tab' tokenizer downloaded successfully.")
# --- End Explicit NLTK Data Download ---


print("\nAll necessary libraries imported and NLTK data checked.")

# Load spaCy models (English and German pipelines used by the tokenization examples)
try:
    nlp_en = spacy.load("en_core_web_sm")
    nlp_de = spacy.load("de_core_news_sm")
    print("spaCy 'en_core_web_sm' and 'de_core_news_sm' models loaded.")
except OSError:
    print("Error loading spaCy models. Did you restart the session after downloading them?")
    print("Please restart the session (Runtime -> Restart session) and re-run this cell.")
    # Re-raise after printing the guidance: swallowing the error would leave
    # nlp_en / nlp_de undefined and make later cells fail with a NameError
    # that hides the real cause.
    raise

--- Ensuring NLTK 'punkt_tab' tokenizer is downloaded ---
Downloading NLTK 'punkt_tab' tokenizer...
NLTK 'punkt_tab' tokenizer downloaded successfully.

All necessary libraries imported and NLTK data checked.
spaCy 'en_core_web_sm' and 'de_core_news_sm' models loaded.


In [None]:
# Example Text for Tokenization
# Two-sentence samples (English and German) fed to the tokenizers in the
# examples that follow. Each literal is written one sentence per line;
# adjacent string literals are concatenated into a single string.

english_text = (
    "Natural language processing (NLP) is a field of artificial intelligence. "
    "It focuses on the interaction between computers and human language."
)
german_text = (
    "Natürliche Sprachverarbeitung (NLP) ist ein Bereich der künstlichen Intelligenz. "
    "Sie konzentriert sich auf die Interaktion zwischen Computern und menschlicher Sprache."
)

print("Example texts defined.")

Example texts defined.


In [None]:
# Tokenization Examples
# Runs the same sample text through NLTK, spaCy and two Hugging Face subword
# tokenizers, then prints token counts side by side.
# Relies on english_text / german_text, nlp_en / nlp_de and the imports from
# the earlier cells.

print("--- NLTK Tokenization ---")
# NLTK: Word Tokenization
nltk_words = word_tokenize(english_text)
print(f"NLTK Word Tokens (English): {nltk_words[:10]}...") # Show first 10 for brevity

# NLTK: Sentence Tokenization
nltk_sentences = sent_tokenize(english_text)
print(f"NLTK Sentence Tokens (English): {nltk_sentences}")

# NLTK: Character-based (simple manual implementation for demonstration)
# NLTK doesn't have a built-in character tokenizer as a primary function,
# but it's easy to do manually: list() yields one element per character,
# spaces and punctuation included.
char_tokens_english = list(english_text)
print(f"Character Tokens (English - manual): {char_tokens_english[:20]}...") # Show first 20


print("\n--- spaCy Tokenization ---")
# spaCy: Word and Sentence Tokenization (more advanced, handles punctuation, etc.)
# A single pipeline call produces a Doc exposing both tokens and sentences.
doc_en = nlp_en(english_text)
spacy_words_en = [token.text for token in doc_en]
print(f"spaCy Word Tokens (English): {spacy_words_en[:10]}...")

spacy_sentences_en = [sent.text for sent in doc_en.sents]
print(f"spaCy Sentence Tokens (English): {spacy_sentences_en}")

# spaCy for German
doc_de = nlp_de(german_text)
spacy_words_de = [token.text for token in doc_de]
print(f"spaCy Word Tokens (German): {spacy_words_de[:10]}...")


print("\n--- Hugging Face Transformers Tokenization (Subword) ---")
# Hugging Face Transformers: Subword Tokenization (e.g., using BERT's tokenizer)
# We'll use a pre-trained tokenizer, which typically uses a subword algorithm
# like WordPiece (for BERT) or SentencePiece (for ALBERT, XLNet, etc.)

# Load a tokenizer for a common model (e.g., 'bert-base-uncased')
# 'uncased' means it converts text to lowercase before tokenizing
tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_tokens_bert = tokenizer_bert.tokenize(english_text)
print(f"Hugging Face (BERT) Subword Tokens (English): {hf_tokens_bert[:15]}...") # Show first 15

# Example with token IDs and special tokens
# NOTE: return_tensors='pt' returns PyTorch tensors, so torch must be available
# in the runtime; the install cell does not install it.
encoded_input = tokenizer_bert(english_text, return_tensors='pt', add_special_tokens=True)
print("Hugging Face (BERT) Encoded Input (with special tokens):")
# [0] selects the single sequence from the batch dimension of the returned tensor.
print(f"  Input IDs: {encoded_input['input_ids'][0]}")
print(f"  Decoded: {tokenizer_bert.decode(encoded_input['input_ids'][0])}")

# Another example with a different subword tokenizer (e.g., ALBERT uses SentencePiece)
tokenizer_albert = AutoTokenizer.from_pretrained("albert-base-v2")
hf_tokens_albert = tokenizer_albert.tokenize(english_text)
print(f"Hugging Face (ALBERT) Subword Tokens (English): {hf_tokens_albert[:15]}...")


print("\n--- Comparison of Tokenization Methods ---")
print(f"Original English Text Length: {len(english_text)} characters")
print(f"NLTK Word Tokens Count: {len(nltk_words)}")
print(f"Character Tokens Count: {len(char_tokens_english)}")
print(f"Hugging Face (BERT) Subword Tokens Count: {len(hf_tokens_bert)}")

# The example in this message matches what the BERT tokenizer actually emits for
# the sample text: 'NLP' is split into 'nl' + '##p', while 'processing' is kept
# as a single token.
print("\nNotice how subword tokenization breaks down rarer words like 'NLP' into 'nl' and '##p', while common words like 'processing' stay whole.")
print("This helps handle out-of-vocabulary words and reduces vocabulary size while retaining semantic information.")


--- NLTK Tokenization ---
NLTK Word Tokens (English): ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of']...
NLTK Sentence Tokens (English): ['Natural language processing (NLP) is a field of artificial intelligence.', 'It focuses on the interaction between computers and human language.']
Character Tokens (English - manual): ['N', 'a', 't', 'u', 'r', 'a', 'l', ' ', 'l', 'a', 'n', 'g', 'u', 'a', 'g', 'e', ' ', 'p', 'r', 'o']...

--- spaCy Tokenization ---
spaCy Word Tokens (English): ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of']...
spaCy Sentence Tokens (English): ['Natural language processing (NLP) is a field of artificial intelligence.', 'It focuses on the interaction between computers and human language.']
spaCy Word Tokens (German): ['Natürliche', 'Sprachverarbeitung', '(', 'NLP', ')', 'ist', 'ein', 'Bereich', 'der', 'künstlichen']...

--- Hugging Face Transformers Tokenization (Subword) ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hugging Face (BERT) Subword Tokens (English): ['natural', 'language', 'processing', '(', 'nl', '##p', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '.', 'it']...
Hugging Face (BERT) Encoded Input (with special tokens):
  Input IDs: tensor([  101,  3019,  2653,  6364,  1006, 17953,  2361,  1007,  2003,  1037,
         2492,  1997,  7976,  4454,  1012,  2009,  7679,  2006,  1996,  8290,
         2090,  7588,  1998,  2529,  2653,  1012,   102])
  Decoded: [CLS] natural language processing ( nlp ) is a field of artificial intelligence. it focuses on the interaction between computers and human language. [SEP]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Hugging Face (ALBERT) Subword Tokens (English): ['▁natural', '▁language', '▁processing', '▁', '(', 'n', 'lp', ')', '▁is', '▁a', '▁field', '▁of', '▁artificial', '▁intelligence', '.']...

--- Comparison of Tokenization Methods ---
Original English Text Length: 140 characters
NLTK Word Tokens Count: 24
Character Tokens Count: 140
Hugging Face (BERT) Subword Tokens Count: 25

Notice how subword tokenization often breaks down words like 'processing' into 'process' and '##ing'.
This helps handle out-of-vocabulary words and reduces vocabulary size while retaining semantic information.
