<a href="https://colab.research.google.com/github/BheezPen/DSN_NLP_MATERIAL_TRAINING/blob/main/COLABipynb_files/02_Tokenization_in_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📌  Tokenization in NLP - Beginner to Advanced Guide

In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoTokenizer
import re

# Fetch the NLTK tokenizer resources this notebook requests, in order:
# 'punkt' first, then 'punkt_tab'.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

### 1️⃣ Introduction to Tokenization
# The overview is printed rather than left as a bare string expression:
# a bare string at the end of a cell is echoed as a single escaped repr
# ('\nTokenization is ...\n'), which is hard to read in the output area.
intro_text = """
Tokenization is the process of breaking text into meaningful units (tokens).
There are different types of tokenization:
  - Word Tokenization: Splitting text into words.
  - Sentence Tokenization: Splitting text into sentences.
  - Subword Tokenization: Used in deep learning (BPE, WordPiece).
""".strip()  # drop the blank first/last lines introduced by the triple quotes
print(intro_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


'\nTokenization is the process of breaking text into meaningful units (tokens).\nThere are different types of tokenization:\n  - Word Tokenization: Splitting text into words.\n  - Sentence Tokenization: Splitting text into sentences.\n  - Subword Tokenization: Used in deep learning (BPE, WordPiece).\n'

In [None]:
# First look at word tokenization on a short two-sentence example.
# `word_tokenize` and the Punkt data are already provided by the setup cell at
# the top of the notebook, so nothing is re-imported or re-downloaded here.
text = "The cat sat on the mat. The dog barked."

# Each word and each sentence-final period comes back as its own token.
word = word_tokenize(text)

print("Word:", word)

Word: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.', 'The', 'dog', 'barked', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
### 2️⃣ Word Tokenization
# Shared sample passage: the sentence- and regex-tokenization cells reuse `txt`.
txt = (
    "Hello everyone! Welcome to the world of NLP. Today, we'll learn tokenization."
)

# Punctuation marks and the clitic "'ll" come back as separate tokens.
word_tokens = word_tokenize(txt)
print("Word Tokens:", word_tokens)

Word Tokens: ['Hello', 'everyone', '!', 'Welcome', 'to', 'the', 'world', 'of', 'NLP', '.', 'Today', ',', 'we', "'ll", 'learn', 'tokenization', '.']


In [None]:
### 3️⃣ Sentence Tokenization
# Split the shared sample passage into sentences; the language is spelled out
# explicitly here but is the same English model that NLTK uses by default.
sentence_tokens = sent_tokenize(txt, language="english")
print("Sentence Tokens:", sentence_tokens)

Sentence Tokens: ['Hello everyone!', 'Welcome to the world of NLP.', "Today, we'll learn tokenization."]


In [None]:
### 4️⃣ Regex-Based Tokenization
# Hand-rolled tokenizer: collect every run of word characters in `txt`.
# Punctuation is discarded, so "we'll" is split into 'we' and 'll'.
regex_tokens = [match.group() for match in re.finditer(r'\b\w+\b', txt)]
print("Regex Tokens:", regex_tokens)

Regex Tokens: ['Hello', 'everyone', 'Welcome', 'to', 'the', 'world', 'of', 'NLP', 'Today', 'we', 'll', 'learn', 'tokenization']


In [None]:
### 5️⃣ Subword Tokenization (WordPiece)
# Subword tokenizers split rare words into smaller known pieces.
# `bert-base-uncased` uses a WordPiece vocabulary (not Byte Pair Encoding):
# continuation pieces are marked with a leading '##', and because the
# checkpoint is "uncased" the input is lower-cased before splitting.
# `AutoTokenizer` is imported in the setup cell at the top of the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # swap in another checkpoint to compare schemes
text = "I have a new unbreakable toy."

# "unbreakable" is not a single vocabulary entry, so it is split into pieces.
tokens = tokenizer.tokenize(text)
print(tokens)

['i', 'have', 'a', 'new', 'un', '##break', '##able', 'toy', '.']


In [None]:
# Recap of the techniques covered above. Printed rather than left as a bare
# string expression so the notebook shows it line by line instead of as a
# single escaped '\nSummary:\n...' repr.
summary_text = """
Summary:
✔️ Word Tokenization - Breaks text into words
✔️ Sentence Tokenization - Breaks text into sentences
✔️ Regex Tokenization - Custom splitting with regex
✔️ Subword Tokenization - Used in deep learning models like BERT
""".strip()  # drop the blank first/last lines introduced by the triple quotes
print(summary_text)
