Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using NLTK library. Use porter stemmer and snowball stemmer for stemming. Use any technique for lemmatization.  

In [1]:
import nltk
from nltk.tokenize import (
    WhitespaceTokenizer,
    word_tokenize,
    sent_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [6]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
text = """NLTK is great! I'm loving #NLP with Python :) It helps in text processing, tokenization, and language analysis.
 Learning NLP is fun and useful for real-world applications. New York is a big city,
 and many developers use Python there for data science and machine learning. Students practice NLP techniques in classrooms and online courses.
 Social media text, tweets, and comments contain emojis, hashtags, and informal language. Treebank and Tweet tokenizers handle such text efficiently.
Stemming reduces words to their root forms, while lemmatization provides meaningful base words.
These text preprocessing steps improve the performance of natural language models.
"""

In [None]:
#tokenization


In [17]:
#whitespace
wt = WhitespaceTokenizer()
wt_tokens=wt.tokenize(text)
print(wt_tokens)


['NLTK', 'is', 'great!', "I'm", 'loving', '#NLP', 'with', 'Python', ':)', 'It', 'helps', 'in', 'text', 'processing,', 'tokenization,', 'and', 'language', 'analysis.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'applications.', 'New', 'York', 'is', 'a', 'big', 'city,', 'and', 'many', 'developers', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning.', 'Students', 'practice', 'NLP', 'techniques', 'in', 'classrooms', 'and', 'online', 'courses.', 'Social', 'media', 'text,', 'tweets,', 'and', 'comments', 'contain', 'emojis,', 'hashtags,', 'and', 'informal', 'language.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'forms,', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'words.', 'These', 'text', 'preprocessing', 'steps', 'improve', 'the', 'performance', 'of', 'natural', 'language', 'models.']


In [18]:

#punctuation-based
tokens_word = word_tokenize(text)
print(tokens_word)

['NLTK', 'is', 'great', '!', 'I', "'m", 'loving', '#', 'NLP', 'with', 'Python', ':', ')', 'It', 'helps', 'in', 'text', 'processing', ',', 'tokenization', ',', 'and', 'language', 'analysis', '.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'applications', '.', 'New', 'York', 'is', 'a', 'big', 'city', ',', 'and', 'many', 'developers', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning', '.', 'Students', 'practice', 'NLP', 'techniques', 'in', 'classrooms', 'and', 'online', 'courses', '.', 'Social', 'media', 'text', ',', 'tweets', ',', 'and', 'comments', 'contain', 'emojis', ',', 'hashtags', ',', 'and', 'informal', 'language', '.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently', '.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'forms', ',', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'words', '.', 'These', 'text', 'preprocessing', 'steps', 'improve', 'the', 'performance', 'o

In [19]:
#treebank
tbt = TreebankWordTokenizer()
tbt_tokens = tbt.tokenize(text)
print(tbt_tokens)

['NLTK', 'is', 'great', '!', 'I', "'m", 'loving', '#', 'NLP', 'with', 'Python', ':', ')', 'It', 'helps', 'in', 'text', 'processing', ',', 'tokenization', ',', 'and', 'language', 'analysis.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'applications.', 'New', 'York', 'is', 'a', 'big', 'city', ',', 'and', 'many', 'developers', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning.', 'Students', 'practice', 'NLP', 'techniques', 'in', 'classrooms', 'and', 'online', 'courses.', 'Social', 'media', 'text', ',', 'tweets', ',', 'and', 'comments', 'contain', 'emojis', ',', 'hashtags', ',', 'and', 'informal', 'language.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'forms', ',', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'words.', 'These', 'text', 'preprocessing', 'steps', 'improve', 'the', 'performance', 'of', 'natural', 'language', '

In [26]:
#tweet tokenization
tweet_tokenize = TweetTokenizer()
tweet_tokens = tweet_tokenize.tokenize(text)
print(tweet_tokens)

['NLTK', 'is', 'great', '!', "I'm", 'loving', '#NLP', 'with', 'Python', ':)', 'It', 'helps', 'in', 'text', 'processing', ',', 'tokenization', ',', 'and', 'language', 'analysis', '.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'applications', '.', 'New', 'York', 'is', 'a', 'big', 'city', ',', 'and', 'many', 'developers', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning', '.', 'Students', 'practice', 'NLP', 'techniques', 'in', 'classrooms', 'and', 'online', 'courses', '.', 'Social', 'media', 'text', ',', 'tweets', ',', 'and', 'comments', 'contain', 'emojis', ',', 'hashtags', ',', 'and', 'informal', 'language', '.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently', '.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'forms', ',', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'words', '.', 'These', 'text', 'preprocessing', 'steps', 'improve', 'the', 'performance', 'of', 'natural

In [25]:
#Multi-word expression
mew_tokenizer = MWETokenizer([('New','York')],separator='_')

tokens_mwe = mew_tokenizer.tokenize(text.split())
print(tokens_mwe)

['NLTK', 'is', 'great!', "I'm", 'loving', '#NLP', 'with', 'Python', ':)', 'It', 'helps', 'in', 'text', 'processing,', 'tokenization,', 'and', 'language', 'analysis.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'applications.', 'New_York', 'is', 'a', 'big', 'city,', 'and', 'many', 'developers', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning.', 'Students', 'practice', 'NLP', 'techniques', 'in', 'classrooms', 'and', 'online', 'courses.', 'Social', 'media', 'text,', 'tweets,', 'and', 'comments', 'contain', 'emojis,', 'hashtags,', 'and', 'informal', 'language.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'forms,', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'words.', 'These', 'text', 'preprocessing', 'steps', 'improve', 'the', 'performance', 'of', 'natural', 'language', 'models.']


In [None]:
#Stemming


In [24]:
#porter stemming
porter = PorterStemmer()
porter_stemmed = [porter.stem(word) for word in tokens_word]
print(porter_stemmed)

['nltk', 'is', 'great', '!', 'i', "'m", 'love', '#', 'nlp', 'with', 'python', ':', ')', 'it', 'help', 'in', 'text', 'process', ',', 'token', ',', 'and', 'languag', 'analysi', '.', 'learn', 'nlp', 'is', 'fun', 'and', 'use', 'for', 'real-world', 'applic', '.', 'new', 'york', 'is', 'a', 'big', 'citi', ',', 'and', 'mani', 'develop', 'use', 'python', 'there', 'for', 'data', 'scienc', 'and', 'machin', 'learn', '.', 'student', 'practic', 'nlp', 'techniqu', 'in', 'classroom', 'and', 'onlin', 'cours', '.', 'social', 'media', 'text', ',', 'tweet', ',', 'and', 'comment', 'contain', 'emoji', ',', 'hashtag', ',', 'and', 'inform', 'languag', '.', 'treebank', 'and', 'tweet', 'token', 'handl', 'such', 'text', 'effici', '.', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form', ',', 'while', 'lemmat', 'provid', 'meaning', 'base', 'word', '.', 'these', 'text', 'preprocess', 'step', 'improv', 'the', 'perform', 'of', 'natur', 'languag', 'model', '.']


In [27]:
#snowball stemmer
snowball = SnowballStemmer('english')
snowball_stemmed = [snowball.stem(word) for word in tokens_word]
print(snowball_stemmed)

['nltk', 'is', 'great', '!', 'i', "'m", 'love', '#', 'nlp', 'with', 'python', ':', ')', 'it', 'help', 'in', 'text', 'process', ',', 'token', ',', 'and', 'languag', 'analysi', '.', 'learn', 'nlp', 'is', 'fun', 'and', 'use', 'for', 'real-world', 'applic', '.', 'new', 'york', 'is', 'a', 'big', 'citi', ',', 'and', 'mani', 'develop', 'use', 'python', 'there', 'for', 'data', 'scienc', 'and', 'machin', 'learn', '.', 'student', 'practic', 'nlp', 'techniqu', 'in', 'classroom', 'and', 'onlin', 'cours', '.', 'social', 'media', 'text', ',', 'tweet', ',', 'and', 'comment', 'contain', 'emoji', ',', 'hashtag', ',', 'and', 'inform', 'languag', '.', 'treebank', 'and', 'tweet', 'token', 'handl', 'such', 'text', 'effici', '.', 'stem', 'reduc', 'word', 'to', 'their', 'root', 'form', ',', 'while', 'lemmat', 'provid', 'meaning', 'base', 'word', '.', 'these', 'text', 'preprocess', 'step', 'improv', 'the', 'perform', 'of', 'natur', 'languag', 'model', '.']


In [None]:
#lemmatization


In [28]:
#wordnetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens_word]
print(lemmatized_words)

['NLTK', 'is', 'great', '!', 'I', "'m", 'loving', '#', 'NLP', 'with', 'Python', ':', ')', 'It', 'help', 'in', 'text', 'processing', ',', 'tokenization', ',', 'and', 'language', 'analysis', '.', 'Learning', 'NLP', 'is', 'fun', 'and', 'useful', 'for', 'real-world', 'application', '.', 'New', 'York', 'is', 'a', 'big', 'city', ',', 'and', 'many', 'developer', 'use', 'Python', 'there', 'for', 'data', 'science', 'and', 'machine', 'learning', '.', 'Students', 'practice', 'NLP', 'technique', 'in', 'classroom', 'and', 'online', 'course', '.', 'Social', 'medium', 'text', ',', 'tweet', ',', 'and', 'comment', 'contain', 'emojis', ',', 'hashtags', ',', 'and', 'informal', 'language', '.', 'Treebank', 'and', 'Tweet', 'tokenizers', 'handle', 'such', 'text', 'efficiently', '.', 'Stemming', 'reduces', 'word', 'to', 'their', 'root', 'form', ',', 'while', 'lemmatization', 'provides', 'meaningful', 'base', 'word', '.', 'These', 'text', 'preprocessing', 'step', 'improve', 'the', 'performance', 'of', 'natura