In [None]:
import gensim
import nltk
import spacy
from keras.preprocessing.text import text_to_word_sequence
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from nltk.tokenize import MWETokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from textblob import TextBlob

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample paragraph (plain prose mixed with emoji) fed to the NLTK tokenizers.
# The "man getting massage" emoji is a ZWJ sequence; its invisible code points
# (zero-width joiner, male sign, variation selector-16) are written as escapes.
text = (
    "Student life can be both exhilarating and stressful at the same time. 📚🎓 "
    "Managing assignments, exams, and social obligations can sometimes feel "
    "overwhelming, but it's also a time for growth and self-discovery. 🌱🌟 "
    "Amidst the hustle and bustle of campus life, it's crucial to find a "
    "balance between academic pursuits and personal well-being. "
    "Remembering to take breaks, prioritize self-care, and seek support when "
    "needed can make all the difference in navigating the challenges of "
    "university life. 🤝💆\u200d\u2642\ufe0f "
    "From late-night study sessions to spontaneous adventures with friends, "
    "each moment shapes the journey of the student experience. 🌌🚀"
)

In [None]:
# Word tokenization: split the sample paragraph into word and punctuation
# tokens with NLTK's default word tokenizer.
nltk_tokens = word_tokenize(text)
print(nltk_tokens)

['Student', 'life', 'can', 'be', 'both', 'exhilarating', 'and', 'stressful', 'at', 'the', 'same', 'time', '.', '📚🎓', 'Managing', 'assignments', ',', 'exams', ',', 'and', 'social', 'obligations', 'can', 'sometimes', 'feel', 'overwhelming', ',', 'but', 'it', "'s", 'also', 'a', 'time', 'for', 'growth', 'and', 'self-discovery', '.', '🌱🌟', 'Amidst', 'the', 'hustle', 'and', 'bustle', 'of', 'campus', 'life', ',', 'it', "'s", 'crucial', 'to', 'find', 'a', 'balance', 'between', 'academic', 'pursuits', 'and', 'personal', 'well-being', '.', 'Remembering', 'to', 'take', 'breaks', ',', 'prioritize', 'self-care', ',', 'and', 'seek', 'support', 'when', 'needed', 'can', 'make', 'all', 'the', 'difference', 'in', 'navigating', 'the', 'challenges', 'of', 'university', 'life', '.', '🤝💆\u200d♂️', 'From', 'late-night', 'study', 'sessions', 'to', 'spontaneous', 'adventures', 'with', 'friends', ',', 'each', 'moment', 'shapes', 'the', 'journey', 'of', 'the', 'student', 'experience', '.', '🌌🚀']


Word tokenization is the process of splitting a body of text into individual words. It is a prerequisite for many natural language processing tasks in which each word must be captured for further analysis, such as classifying or counting words for sentiment analysis. The Natural Language Toolkit (NLTK) is a library that provides this functionality.

In [None]:
# Sentence tokenization: break the sample paragraph into a list of sentences.
sentence_tokens = nltk.sent_tokenize(text)
print("\nSentence Tokenization:", sentence_tokens)


Sentence Tokenization: ['Student life can be both exhilarating and stressful at the same time.', "📚🎓 Managing assignments, exams, and social obligations can sometimes feel overwhelming, but it's also a time for growth and self-discovery.", "🌱🌟 Amidst the hustle and bustle of campus life, it's crucial to find a balance between academic pursuits and personal well-being.", 'Remembering to take breaks, prioritize self-care, and seek support when needed can make all the difference in navigating the challenges of university life.', '🤝💆\u200d♂️ From late-night study sessions to spontaneous adventures with friends, each moment shapes the journey of the student experience.', '🌌🚀']


Sentence tokenization is the process of dividing a paragraph into its individual sentences, using sentence-ending punctuation to find the boundaries.

In [None]:
# Punctuation-based tokenizer: every run of punctuation becomes its own token,
# which also splits contractions and hyphenated words.
punct_tokenizer = nltk.WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(text)
print("\nPunctuation-based Tokenizer:", punct_tokens)



Punctuation-based Tokenizer: ['Student', 'life', 'can', 'be', 'both', 'exhilarating', 'and', 'stressful', 'at', 'the', 'same', 'time', '.', '📚🎓', 'Managing', 'assignments', ',', 'exams', ',', 'and', 'social', 'obligations', 'can', 'sometimes', 'feel', 'overwhelming', ',', 'but', 'it', "'", 's', 'also', 'a', 'time', 'for', 'growth', 'and', 'self', '-', 'discovery', '.', '🌱🌟', 'Amidst', 'the', 'hustle', 'and', 'bustle', 'of', 'campus', 'life', ',', 'it', "'", 's', 'crucial', 'to', 'find', 'a', 'balance', 'between', 'academic', 'pursuits', 'and', 'personal', 'well', '-', 'being', '.', 'Remembering', 'to', 'take', 'breaks', ',', 'prioritize', 'self', '-', 'care', ',', 'and', 'seek', 'support', 'when', 'needed', 'can', 'make', 'all', 'the', 'difference', 'in', 'navigating', 'the', 'challenges', 'of', 'university', 'life', '.', '🤝💆\u200d♂️', 'From', 'late', '-', 'night', 'study', 'sessions', 'to', 'spontaneous', 'adventures', 'with', 'friends', ',', 'each', 'moment', 'shapes', 'the', 'journ

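The punctuation-based tokenizer (`WordPunctTokenizer`) splits a paragraph or sentence into runs of word characters and runs of punctuation, so every punctuation mark becomes a separate token. The resulting tokens may be words, emojis, or special characters. As the output shows, this also breaks up contractions (`it`, `'`, `s`) and hyphenated words (`self`, `-`, `discovery`).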

In [None]:
# Treebank Word tokenizer
treebank_tokenizer = TreebankWordTokenizer()
# The Treebank tokenizer expects input that is already split into sentences:
# it only separates a period at the very end of the string it is given.
# Tokenizing the whole paragraph at once therefore leaves tokens such as
# 'time.' and 'life.' glued together, so tokenize sentence by sentence.
treebank_tokens = [
    token
    for sentence in sent_tokenize(text)
    for token in treebank_tokenizer.tokenize(sentence)
]
print("\nTreebank Word tokenizer:", treebank_tokens)


Treebank Word tokenizer: ['Student', 'life', 'can', 'be', 'both', 'exhilarating', 'and', 'stressful', 'at', 'the', 'same', 'time.', '📚🎓', 'Managing', 'assignments', ',', 'exams', ',', 'and', 'social', 'obligations', 'can', 'sometimes', 'feel', 'overwhelming', ',', 'but', 'it', "'s", 'also', 'a', 'time', 'for', 'growth', 'and', 'self-discovery.', '🌱🌟', 'Amidst', 'the', 'hustle', 'and', 'bustle', 'of', 'campus', 'life', ',', 'it', "'s", 'crucial', 'to', 'find', 'a', 'balance', 'between', 'academic', 'pursuits', 'and', 'personal', 'well-being.', 'Remembering', 'to', 'take', 'breaks', ',', 'prioritize', 'self-care', ',', 'and', 'seek', 'support', 'when', 'needed', 'can', 'make', 'all', 'the', 'difference', 'in', 'navigating', 'the', 'challenges', 'of', 'university', 'life.', '🤝💆\u200d♂️', 'From', 'late-night', 'study', 'sessions', 'to', 'spontaneous', 'adventures', 'with', 'friends', ',', 'each', 'moment', 'shapes', 'the', 'journey', 'of', 'the', 'student', 'experience.', '🌌🚀']


The Treebank tokenizer separates out clitics, which usually appear only in combination with another word, as in "I'm" and "they'll".
It treats most punctuation characters as separate tokens.
It splits off commas and single quotes when they are followed by whitespace.
It separates periods only at the end of its input, so it expects text that has already been split into sentences (for example with `sent_tokenize`).

In [None]:
# Detokenize using TreebankWordDetokenizer (imported with the other
# tokenizers in the first cell) to rebuild a string from the Treebank tokens.
detokenizer = TreebankWordDetokenizer()
detokenized_text = detokenizer.detokenize(treebank_tokens)

print("Detokenized text:", detokenized_text)


Detokenized text: Student life can be both exhilarating and stressful at the same time. 📚🎓 Managing assignments, exams, and social obligations can sometimes feel overwhelming, but it's also a time for growth and self-discovery. 🌱🌟 Amidst the hustle and bustle of campus life, it's crucial to find a balance between academic pursuits and personal well-being. Remembering to take breaks, prioritize self-care, and seek support when needed can make all the difference in navigating the challenges of university life. 🤝💆‍♂️ From late-night study sessions to spontaneous adventures with friends, each moment shapes the journey of the student experience. 🌌🚀


As an experiment, the Treebank tokens are passed to `TreebankWordDetokenizer` to check that the original text can be reconstructed from them.

In [None]:
# Tweet Tokenizer, constructed with its default options spelled out:
# keep the original casing, keep repeated characters, keep @handles.
tweet_tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
)
tweet_tokens = tweet_tokenizer.tokenize(text)
print("\nTweet Tokenizer:", tweet_tokens)


Tweet Tokenizer: ['Student', 'life', 'can', 'be', 'both', 'exhilarating', 'and', 'stressful', 'at', 'the', 'same', 'time', '.', '📚', '🎓', 'Managing', 'assignments', ',', 'exams', ',', 'and', 'social', 'obligations', 'can', 'sometimes', 'feel', 'overwhelming', ',', 'but', "it's", 'also', 'a', 'time', 'for', 'growth', 'and', 'self-discovery', '.', '🌱', '🌟', 'Amidst', 'the', 'hustle', 'and', 'bustle', 'of', 'campus', 'life', ',', "it's", 'crucial', 'to', 'find', 'a', 'balance', 'between', 'academic', 'pursuits', 'and', 'personal', 'well-being', '.', 'Remembering', 'to', 'take', 'breaks', ',', 'prioritize', 'self-care', ',', 'and', 'seek', 'support', 'when', 'needed', 'can', 'make', 'all', 'the', 'difference', 'in', 'navigating', 'the', 'challenges', 'of', 'university', 'life', '.', '🤝', '💆\u200d♂', '️', 'From', 'late-night', 'study', 'sessions', 'to', 'spontaneous', 'adventures', 'with', 'friends', ',', 'each', 'moment', 'shapes', 'the', 'journey', 'of', 'the', 'student', 'experience', '

`TweetTokenizer` is a specialized tokenizer designed to handle tweets and other social media text. Tweets have distinctive features, such as hashtags, mentions, emojis, abbreviations, and URLs, that demand special treatment during tokenization.

In [None]:
# preserve_case=False: every token is lowercased.
tweet = "@aminbaybon HEllllllllo everybody i wish you a good #day"
tokenizer = TweetTokenizer(preserve_case=False)
tweet_tokens = tokenizer.tokenize(tweet)
print(tweet_tokens)

['@aminbaybon', 'hellllllllo', 'everybody', 'i', 'wish', 'you', 'a', 'good', '#day']


In [None]:
# strip_handles=True: the @mention is removed from the token list.
tweet = "@aminbaybon HEllllllllo everybody i wish you a good #day"
tokenizer = TweetTokenizer(strip_handles=True)
tweet_tokens = tokenizer.tokenize(tweet)
print(tweet_tokens)

['HEllllllllo', 'everybody', 'i', 'wish', 'you', 'a', 'good', '#day']


In [None]:
# Combine several options: lowercase the tokens, keep repeated characters,
# drop the @mention, and keep the phone number together as one token.
tweet = "@aminbaybon HEllllllllo everybody i wish you a good #day 415 123 1234"
tokenizer = TweetTokenizer(
    preserve_case=False,
    reduce_len=False,
    strip_handles=True,
    match_phone_numbers=True,
)
tweet_tokens = tokenizer.tokenize(tweet)
print(tweet_tokens)

['hellllllllo', 'everybody', 'i', 'wish', 'you', 'a', 'good', '#day', '415 123 1234']


Enabling the `match_phone_numbers` parameter ensures that a phone number is kept as a single token instead of being split into separate groups of digits.

In [None]:
# Multi-Word Expression Tokenizer: register each expression, then merge
# matching runs of word tokens into single tokens.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('Student', 'life'))
mwe_tokenizer.add_mwe(('of', 'campus'))
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))
print("\nMulti-Word Expression Tokenizer:", mwe_tokens)


Multi-Word Expression Tokenizer: ['Student_life', 'can', 'be', 'both', 'exhilarating', 'and', 'stressful', 'at', 'the', 'same', 'time', '.', '📚🎓', 'Managing', 'assignments', ',', 'exams', ',', 'and', 'social', 'obligations', 'can', 'sometimes', 'feel', 'overwhelming', ',', 'but', 'it', "'s", 'also', 'a', 'time', 'for', 'growth', 'and', 'self-discovery', '.', '🌱🌟', 'Amidst', 'the', 'hustle', 'and', 'bustle', 'of_campus', 'life', ',', 'it', "'s", 'crucial', 'to', 'find', 'a', 'balance', 'between', 'academic', 'pursuits', 'and', 'personal', 'well-being', '.', 'Remembering', 'to', 'take', 'breaks', ',', 'prioritize', 'self-care', ',', 'and', 'seek', 'support', 'when', 'needed', 'can', 'make', 'all', 'the', 'difference', 'in', 'navigating', 'the', 'challenges', 'of', 'university', 'life', '.', '🤝💆\u200d♂️', 'From', 'late-night', 'study', 'sessions', 'to', 'spontaneous', 'adventures', 'with', 'friends', ',', 'each', 'moment', 'shapes', 'the', 'journey', 'of', 'the', 'student', 'experience',

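Multi-word tokenization, also known as phrase tokenization or multi-word expression (MWE) tokenization, is the process of splitting text into meaningful multi-word units rather than individual words. This is particularly useful in natural language processing (NLP) tasks where understanding the meaning of phrases or idiomatic expressions is important. NLTK's `MWETokenizer` works on text that has already been split into words and merges each registered expression into a single token joined by an underscore (for example, `Student_life`).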

In [None]:
#Textblob Tokenizer
# NOTE: this rebinds `text`; the spaCy and Gensim cells further down tokenize
# this NLP paragraph rather than the student-life sample used earlier.
# The pieces are joined by implicit string concatenation, so every piece that
# ends a word must end with a space (a missing one previously produced the
# bogus token "languagegeneration"). The opening parenthesis before
# "frequently" is also restored so the parentheses are balanced.
text = (
    "Natural language processing (NLP) is a field "
    "of computer science, artificial intelligence "
    "and computational linguistics concerned with "
    "the interactions between computers and human "
    "(natural) languages, and, in particular, "
    "concerned with programming computers to "
    "fruitfully process large natural language "
    "corpora. Challenges in natural language "
    "processing frequently involve natural "
    "language understanding, natural language "
    "generation (frequently from formal, machine"
    "-readable logical forms), connecting language "
    "and machine perception, managing human-"
    "computer dialog systems, or some combination "
    "thereof."
)

# create a TextBlob object
blob_object = TextBlob(text)

# tokenize paragraph into words.
print(" Word Tokenize :\n", blob_object.words)

# tokenize paragraph into sentences.
print("\n Sentence Tokenize :\n", blob_object.sentences)

 Word Tokenize :
 ['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'field', 'of', 'computer', 'science', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages', 'and', 'in', 'particular', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'languagegeneration', 'frequently', 'from', 'formal', 'machine-readable', 'logical', 'forms', 'connecting', 'language', 'and', 'machine', 'perception', 'managing', 'human-computer', 'dialog', 'systems', 'or', 'some', 'combination', 'thereof']

 Sentence Tokenize :
 [Sentence("Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions

TextBlob is a Python library for processing textual data. Its tokenization functionality splits text into individual words or sentences.

In [None]:
# spaCy blank pipeline: a bare English pipeline created without loading a
# pretrained model, used here only to tokenize.
nlp = spacy.blank("en")

doc = nlp(
    "Spacy is a library that comes under NLP (Natural Language Processing). "
    "It is an object-oriented Library that is used to deal with pre-processing "
    "of text, and sentences, and to extract information from the text using "
    "modules and functions."
)

# Print one token per line.
for token in doc:
    print(token.text)

Spacy
is
a
library
that
comes
under
NLP
(
Natural
Language
Processing
)
.
It
is
an
object
-
oriented
Library
that
is
used
to
deal
with
pre
-
processing
of
text
,
and
sentences
,
and
to
extract
information
from
the
text
using
modules
and
functions
.


In [None]:
# spaCy Tokenizer: run the small pretrained English pipeline over `text`
# and collect the text of every token.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
spacy_tokens = [tok.text for tok in doc]
print("\nspaCy Tokenizer:", spacy_tokens)


spaCy Tokenizer: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', '.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', ',', 'natural', 'languagegeneration', 'frequently', 'from', 'formal', ',', 'machine', '-', 'readable', 'logical', 'forms', ')', ',', 'connecting', 'language', 'and', 'machine', 'perception', ',', 'managing', 'human', '-', 'computer', 'dialog', 'systems', ',', 'or', 'some', 'combination', 'thereof', '.']


spaCy's tokenizer breaks raw text into tokens efficiently, handling languages, punctuation, contractions, and special characters intelligently, crucial for various NLP tasks.

In [None]:
# Gensim word tokenizer
# gensim.utils.tokenize returns a lazy generator. Materialize it once so that
# gensim_tokens is still a usable list after printing, instead of an exhausted
# generator that yields nothing on any later use.
gensim_tokens = list(gensim.utils.tokenize(text))
print("\nGensim word tokenizer:", gensim_tokens)


Gensim word tokenizer: ['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'field', 'of', 'computer', 'science', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'natural', 'languages', 'and', 'in', 'particular', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding', 'natural', 'languagegeneration', 'frequently', 'from', 'formal', 'machine', 'readable', 'logical', 'forms', 'connecting', 'language', 'and', 'machine', 'perception', 'managing', 'human', 'computer', 'dialog', 'systems', 'or', 'some', 'combination', 'thereof']



Gensim, another popular NLP library, offers a simple word tokenizer, `gensim.utils.tokenize`, which lazily yields runs of alphabetic characters. As the output above shows, punctuation is discarded and hyphenated words such as "machine-readable" are split into separate tokens.


Keras, a deep learning library, provides the `text_to_word_sequence` helper for tokenization. By default it lowercases the text, filters out punctuation and other special characters, and splits on whitespace to produce a list of word tokens.