In [1]:

!pip install nltk


import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:

# NLP PREPROCESSING USING NLTK


# Import tokenizers
from nltk.tokenize import (
    WhitespaceTokenizer,
    wordpunct_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)

# Import stemmers and lemmatizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

# Sample text: mixes a contraction, punctuation, an ampersand and hashtags so
# the tokenizers below visibly disagree about where tokens begin and end.
text = "NLTK is a powerful library for NLP. I'm learning tokenization, stemming & lemmatization! #AI #NLP"


def show_section(heading, payload):
    """Print a heading line, the payload on the next line, then a 50-dash rule."""
    print(heading)
    print(payload)
    print("-" * 50)


show_section("Original Text:", text)

# 1. Whitespace: split on whitespace only, so punctuation stays attached to
#    the neighbouring word (e.g. 'NLP.' and 'tokenization,').
wt = WhitespaceTokenizer()
show_section("1. Whitespace Tokenization:", wt.tokenize(text))

# 2. Word/punctuation: punctuation is separated from alphanumeric runs, which
#    also breaks "I'm" into three tokens.
show_section("2. Punctuation-based Tokenization:", wordpunct_tokenize(text))

# 3. Treebank: splits contractions as "I" + "'m". Note that the mid-text
#    'NLP.' keeps its period; the tokenizer is documented as expecting input
#    that has already been split into sentences.
tb = TreebankWordTokenizer()
show_section("3. Treebank Tokenization:", tb.tokenize(text))

# 4. Tweet: tokenizer aimed at social-media text; keeps "I'm" as one token.
tweet_tokenizer = TweetTokenizer()
show_section("4. Tweet Tokenization:", tweet_tokenizer.tokenize(text))

# 5. Multi-word expressions: works on an already-split token list (hence
#    sentence.split()) and re-joins the listed expressions into single tokens.
mwe = MWETokenizer([('machine', 'learning'), ('natural', 'language', 'processing')])
sentence = "I am studying machine learning and natural language processing"
show_section("5. MWE Tokenization:", mwe.tokenize(sentence.split()))


# STEMMING
#
# Both stemmers are run over the same word list so their results can be
# compared side by side in the output.

words = ["running", "runs", "runner", "easily", "fairness"]

porter = PorterStemmer()
snowball = SnowballStemmer("english")

# One loop drives both demos: heading, one "word -> stem" line per word,
# then a divider.
for title, stemmer in (
    ("Porter Stemming:", porter),
    ("Snowball Stemming:", snowball),
):
    print(title)
    for word in words:
        print(word, "->", stemmer.stem(word))
    print("-" * 50)


# LEMMATIZATION
#
# Each example passes an explicit part-of-speech tag ("v" = verb,
# "a" = adjective); WordNetLemmatizer.lemmatize defaults to treating the
# word as a noun when no tag is given.

lemmatizer = WordNetLemmatizer()
print("Lemmatization:")
for label, token, pos_tag in (
    ("running (verb) ->", "running", "v"),
    ("better (adjective) ->", "better", "a"),
):
    print(label, lemmatizer.lemmatize(token, pos=pos_tag))


Original Text:
NLTK is a powerful library for NLP. I'm learning tokenization, stemming & lemmatization! #AI #NLP
--------------------------------------------------
1. Whitespace Tokenization:
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP.', "I'm", 'learning', 'tokenization,', 'stemming', '&', 'lemmatization!', '#AI', '#NLP']
--------------------------------------------------
2. Punctuation-based Tokenization:
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '.', 'I', "'", 'm', 'learning', 'tokenization', ',', 'stemming', '&', 'lemmatization', '!', '#', 'AI', '#', 'NLP']
--------------------------------------------------
3. Treebank Tokenization:
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP.', 'I', "'m", 'learning', 'tokenization', ',', 'stemming', '&', 'lemmatization', '!', '#', 'AI', '#', 'NLP']
--------------------------------------------------
4. Tweet Tokenization:
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '.', "I'm", 'learning', 'tokenizat