In [None]:
# NLTK: fetch the 'punkt' data package, then import the NLTK tokenizers.
import nltk
nltk.download('punkt')
# NOTE(review): TreebankWordTokenizer is imported but not used in the cells visible here.
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
# Other libraries whose tokenizers are compared against NLTK below.
from textblob import TextBlob
import spacy
import gensim
from gensim.utils import tokenize
from keras.preprocessing.text import text_to_word_sequence

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Paragraph

In [None]:
# Sample paragraph (four sentences, with emoji) that every tokenizer below is run on.
# Written as adjacent string literals, one per sentence; Python joins them into one string.
para = (
    "The sun ☀️ rose gracefully over the horizon, painting the sky in hues of pink and orange 🌅. "
    "Birds 🐦 chirped cheerfully as they welcomed the new day, while a gentle breeze 🍃 whispered through the trees 🌳. "
    "In the distance, the sound of waves 🌊 crashing against the shore added a soothing rhythm to the morning symphony. "
    "It was a perfect moment, filled with peace ✌️ and tranquility."
)

# Word Tokenization


Word tokenization refers to the process of breaking down a text or a sentence into individual words or tokens. These tokens are the basic units of meaning in a language and are often used as input for various NLP tasks such as text classification, sentiment analysis, and machine translation.

In [None]:
# Split the paragraph into word and punctuation tokens with NLTK's
# word tokenizer (English is the default, spelled out here).
tokens = word_tokenize(para, language="english")
print(tokens)

['The', 'sun', '☀️', 'rose', 'gracefully', 'over', 'the', 'horizon', ',', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', '.', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', ',', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', '.', 'In', 'the', 'distance', ',', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', '.', 'It', 'was', 'a', 'perfect', 'moment', ',', 'filled', 'with', 'peace', '✌️', 'and', 'tranquility', '.']


# Sentence Tokenization

Sentence tokenization involves splitting a text or a corpus into individual sentences. It is used for tasks such as text summarization, sentiment analysis, and machine translation, where understanding the meaning and structure of individual sentences is essential for accurate analysis and interpretation.



In [None]:
# Split the paragraph into a list of sentence strings with NLTK's
# sentence tokenizer (English is the default, spelled out here).
sentence = sent_tokenize(para, language="english")
print(sentence)

['The sun ☀️ rose gracefully over the horizon, painting the sky in hues of pink and orange 🌅.', 'Birds 🐦 chirped cheerfully as they welcomed the new day, while a gentle breeze 🍃 whispered through the trees 🌳.', 'In the distance, the sound of waves 🌊 crashing against the shore added a soothing rhythm to the morning symphony.', 'It was a perfect moment, filled with peace ✌️ and tranquility.']


# Punctuation-based Tokenizer


In Natural Language Processing (NLP), a Punctuation-based Tokenizer is a type of tokenizer specifically designed to break down text into tokens based on punctuation marks. This tokenizer splits text into tokens wherever it encounters punctuation characters such as periods, commas, question marks, exclamation marks, etc.

In [None]:
# Build the regex-based tokenizer and show its configuration (its repr
# includes the pattern it splits on).
punkt_tokenizer = WordPunctTokenizer()
print(punkt_tokenizer)

# Tokenize the paragraph. The pattern groups consecutive non-word,
# non-space characters together, so an emoji directly followed by a
# period comes out as a single token.
punkt_tokens = punkt_tokenizer.tokenize(para)
print(punkt_tokens)

WordPunctTokenizer(pattern='\\w+|[^\\w\\s]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)
['The', 'sun', '☀️', 'rose', 'gracefully', 'over', 'the', 'horizon', ',', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅.', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', ',', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳.', 'In', 'the', 'distance', ',', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', '.', 'It', 'was', 'a', 'perfect', 'moment', ',', 'filled', 'with', 'peace', '✌️', 'and', 'tranquility', '.']


# Tweet Tokenizer
The Tweet Tokenizer is designed for tokenizing tweets or short social media messages. Unlike traditional tokenizers, which may split text based on spaces or punctuation alone, the Tweet Tokenizer takes into account the unique characteristics of tweets, such as hashtags, mentions, URLs, and emojis.

In [None]:
# Tokenize the paragraph with NLTK's TweetTokenizer using its default settings.
# The tokenizer object itself is not printed: it only has the default object
# repr (class name plus a memory address), which carries no information and
# differs on every run.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(para)
print(tweet_tokens)

<nltk.tokenize.casual.TweetTokenizer object at 0x7b5ee75ac4c0>
['The', 'sun', '☀', '️', 'rose', 'gracefully', 'over', 'the', 'horizon', ',', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', '.', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', ',', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', '.', 'In', 'the', 'distance', ',', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', '.', 'It', 'was', 'a', 'perfect', 'moment', ',', 'filled', 'with', 'peace', '✌', '️', 'and', 'tranquility', '.']


# Multi-Word Expression Tokenizer

Multi-Word Expression Tokenizer is a tool or algorithm designed to identify and tokenize multi-word expressions (MWEs) within a text. MWEs are sequences of words that often appear together and convey a specific meaning that may not be predictable from the individual words alone.

In [None]:
# Multi-Word Expression Tokenizer
# Each tuple is a sequence of consecutive tokens to merge into one token.
# The expressions must actually occur in `para` (as adjacent word tokens);
# otherwise the result is identical to plain word_tokenize output.
multi_word_expr = [("new", "day"), ("gentle", "breeze"), ("morning", "symphony")]
mwe_tokenizer = MWETokenizer(multi_word_expr)
# MWETokenizer works on an already-tokenized list, so word-tokenize first.
# Merged tokens are joined with the tokenizer's default separator.
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(para))
print(mwe_tokens)

['The', 'sun', '☀️', 'rose', 'gracefully', 'over', 'the', 'horizon', ',', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', '.', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', ',', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', '.', 'In', 'the', 'distance', ',', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', '.', 'It', 'was', 'a', 'perfect', 'moment', ',', 'filled', 'with', 'peace', '✌️', 'and', 'tranquility', '.']


# TextBlob Word Tokenizer


TextBlob is a Python library for processing textual data, and it can break a text down into individual words or tokens. Wrapping a string in `TextBlob(...)` and reading its `words` property returns a list of the words extracted from that text. The text is split on whitespace and punctuation, and the punctuation marks themselves are left out of the result.

In [None]:
# Wrap the paragraph in a TextBlob, then read its word list.
blob = TextBlob(para)
textblob_tokens = blob.words
print(textblob_tokens)

['The', 'sun', '☀️', 'rose', 'gracefully', 'over', 'the', 'horizon', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', 'In', 'the', 'distance', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', 'It', 'was', 'a', 'perfect', 'moment', 'filled', 'with', 'peace', '✌️', 'and', 'tranquility']


# spaCy Tokenizer


The spaCy tokenizer is a component of the spaCy library that is responsible for breaking down a text into individual tokens, such as words or punctuation marks.

In [None]:
# Load spaCy's small English pipeline; only its tokenizer is needed here.
nlp = spacy.load("en_core_web_sm")
# make_doc() runs just the tokenizer and skips the other pipeline
# components, which are unnecessary work when only token texts are wanted.
spacy_tokens = [token.text for token in nlp.make_doc(para)]
print(spacy_tokens)

['The', 'sun', '☀', '️', 'rose', 'gracefully', 'over', 'the', 'horizon', ',', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', '.', 'Birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', ',', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', '.', 'In', 'the', 'distance', ',', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', '.', 'It', 'was', 'a', 'perfect', 'moment', ',', 'filled', 'with', 'peace', '✌', '️', 'and', 'tranquility', '.']


# Gensim Word Tokenizer

The gensim.utils.tokenize() function is a part of Gensim's utilities for text processing. It tokenizes the input text (para) into individual tokens or words. The output gensim_tokens would be an iterable object containing the tokens extracted from the paragraph.

In [None]:
# gensim's tokenize() returns a lazy iterable, so materialise it into a list
# before printing. (The earlier duplicate assignment via gensim.utils.tokenize
# was overwritten immediately and has been dropped.)
gensim_tokens = list(tokenize(para))
print(gensim_tokens)

['The', 'sun', 'rose', 'gracefully', 'over', 'the', 'horizon', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', 'Birds', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', 'while', 'a', 'gentle', 'breeze', 'whispered', 'through', 'the', 'trees', 'In', 'the', 'distance', 'the', 'sound', 'of', 'waves', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', 'It', 'was', 'a', 'perfect', 'moment', 'filled', 'with', 'peace', 'and', 'tranquility']


# Tokenization with Keras in Python ML
Keras is a high-level neural networks API written in Python, which can be used for various machine learning tasks, including NLP. With Keras, tokenization can be easily implemented using its built-in text preprocessing utilities, such as the Tokenizer class. This class provides methods to tokenize text, convert it into sequences of integers (which represent the tokens), and handle aspects like padding and truncating sequences to ensure consistent input sizes for neural networks. The example below uses the simpler `text_to_word_sequence` helper, which lowercases the text, strips punctuation, and splits it into a list of words.

In [None]:
# Keras word-level tokenization.
# The defaults are spelled out: lowercase the text and split on single spaces
# (punctuation is removed by the helper's default filter set).
keras_tokens = text_to_word_sequence(para, lower=True, split=" ")
print(keras_tokens)

['the', 'sun', '☀️', 'rose', 'gracefully', 'over', 'the', 'horizon', 'painting', 'the', 'sky', 'in', 'hues', 'of', 'pink', 'and', 'orange', '🌅', 'birds', '🐦', 'chirped', 'cheerfully', 'as', 'they', 'welcomed', 'the', 'new', 'day', 'while', 'a', 'gentle', 'breeze', '🍃', 'whispered', 'through', 'the', 'trees', '🌳', 'in', 'the', 'distance', 'the', 'sound', 'of', 'waves', '🌊', 'crashing', 'against', 'the', 'shore', 'added', 'a', 'soothing', 'rhythm', 'to', 'the', 'morning', 'symphony', 'it', 'was', 'a', 'perfect', 'moment', 'filled', 'with', 'peace', '✌️', 'and', 'tranquility']
