# 📌  Tokenization in NLP - Beginner to Advanced Guide

In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoTokenizer
import re

# Ensure required datasets are downloaded.
# nltk.download() fetches the named resource into the local nltk_data
# directory and reports packages that are already up to date.
nltk.download('punkt')
# NOTE(review): presumably newer NLTK releases look up 'punkt_tab' rather than
# 'punkt' for word_tokenize/sent_tokenize, hence both downloads — confirm
# against the installed NLTK version.
nltk.download('punkt_tab') # Download the 'punkt_tab' resource

### 1️⃣ Introduction to Tokenization
# NOTE: the triple-quoted block below is a bare string expression, not a
# comment. Because it is the last statement in the cell, the notebook echoes
# its repr as the cell's output.
"""
Tokenization is the process of breaking text into meaningful units (tokens).
There are different types of tokenization:
  - Word Tokenization: Splitting text into words.
  - Sentence Tokenization: Splitting text into sentences.
  - Subword Tokenization: Used in deep learning (BPE, WordPiece).
"""

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


'\nTokenization is the process of breaking text into meaningful units (tokens).\nThere are different types of tokenization:\n  - Word Tokenization: Splitting text into words.\n  - Sentence Tokenization: Splitting text into sentences.\n  - Subword Tokenization: Used in deep learning (BPE, WordPiece).\n'

In [None]:
# Sample text for tokenization.
# `txt` is reused by the sentence, regex and BERT examples in later cells.
txt = "Hello everyone! Welcome to the world of NLP. Today, we'll learn tokenization."

### 2️⃣ Word Tokenization
# word_tokenize returns a list of strings: punctuation marks come back as
# separate tokens and the contraction "we'll" is split into "we" and "'ll".
word_tokens = word_tokenize(txt)
print("Word Tokens:", word_tokens)

Word Tokens: ['Hello', 'everyone', '!', 'Welcome', 'to', 'the', 'world', 'of', 'NLP', '.', 'Today', ',', 'we', "'ll", 'learn', 'tokenization', '.']


In [None]:
### 3️⃣ Sentence Tokenization
# sent_tokenize returns one string per sentence with its closing punctuation
# kept; the sample text yields three sentences.
sentence_tokens = sent_tokenize(txt)
print("Sentence Tokens:", sentence_tokens)

Sentence Tokens: ['Hello everyone!', 'Welcome to the world of NLP.', "Today, we'll learn tokenization."]


In [None]:
### 4️⃣ Regex-Based Tokenization
# Custom tokenization using regular expressions.
# Only runs of word characters are kept, so punctuation is dropped and the
# apostrophe in "we'll" splits the contraction into "we" and "ll".
regex_tokens = [match.group(0) for match in re.finditer(r'\b\w+\b', txt)]
print("Regex Tokens:", regex_tokens)

Regex Tokens: ['Hello', 'everyone', 'Welcome', 'to', 'the', 'world', 'of', 'NLP', 'Today', 'we', 'll', 'learn', 'tokenization']


In [None]:
### 5️⃣ Subword Tokenization (WordPiece)
# Using a transformer-based tokenizer.
# BERT's tokenizer uses WordPiece rather than Byte Pair Encoding: a word may be
# split into smaller pieces, and each continuation piece is prefixed with "##"
# (e.g. "tokenization" -> "token", "##ization").
# from_pretrained downloads the tokenizer files from the Hugging Face Hub on
# first use; the "bert-base-uncased" tokenizer also lowercases the input text.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize(txt)
print("BERT Tokenization:", tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERT Tokenization: ['hello', 'everyone', '!', 'welcome', 'to', 'the', 'world', 'of', 'nl', '##p', '.', 'today', ',', 'we', "'", 'll', 'learn', 'token', '##ization', '.']


In [None]:
"""
Summary:
✔️ Word Tokenization - Breaks text into words
✔️ Sentence Tokenization - Breaks text into sentences
✔️ Regex Tokenization - Custom splitting with regex
✔️ Subword Tokenization - Used in deep learning models like BERT
"""
