# NLP Preprocessing Using NLTK

In [19]:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [20]:
text = "NLTK is a powerful Python library for natural language processing! It helps with tokenization, stemming, and more."

In [21]:
# Sentence tokenization
sent_tokens = sent_tokenize(text)
print("Sentence Tokens:", sent_tokens)

# Word tokenization
word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)

Sentence Tokens: ['NLTK is a powerful Python library for natural language processing!', 'It helps with tokenization, stemming, and more.']
Word Tokens: ['NLTK', 'is', 'a', 'powerful', 'Python', 'library', 'for', 'natural', 'language', 'processing', '!', 'It', 'helps', 'with', 'tokenization', ',', 'stemming', ',', 'and', 'more', '.']


In [22]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in word_tokens if word not in stop_words]
print("Tokens without Stopwords:", filtered_tokens)

Tokens without Stopwords: ['NLTK', 'powerful', 'Python', 'library', 'natural', 'language', 'processing', '!', 'It', 'helps', 'tokenization', ',', 'stemming', ',', '.']


In [23]:
# Stemming(Reduce words to their root form)
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['nltk', 'power', 'python', 'librari', 'natur', 'languag', 'process', '!', 'it', 'help', 'token', ',', 'stem', ',', '.']


In [24]:
# Lemmitization(More accurate word normalization)
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['NLTK', 'powerful', 'Python', 'library', 'natural', 'language', 'processing', '!', 'It', 'help', 'tokenization', ',', 'stemming', ',', '.']


In [25]:
pos_tags = pos_tag(filtered_tokens)
print("POS Tags:", pos_tags)

POS Tags: [('NLTK', 'NNP'), ('powerful', 'JJ'), ('Python', 'NNP'), ('library', 'JJ'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('!', '.'), ('It', 'PRP'), ('helps', 'VBZ'), ('tokenization', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('.', '.')]


# NLP Preprocessing Using Hugging Face

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [27]:
# Tokenize text
tokens = tokenizer.tokenize(text)
print("Hugging Face Tokenized:", tokens)

Hugging Face Tokenized: ['nl', '##t', '##k', 'is', 'a', 'powerful', 'python', 'library', 'for', 'natural', 'language', 'processing', '!', 'it', 'helps', 'with', 'token', '##ization', ',', 'stemming', ',', 'and', 'more', '.']


In [28]:
input_ids = tokenizer.encode(text)
print("Input IDs:", input_ids)

Input IDs: [101, 17953, 2102, 2243, 2003, 1037, 3928, 18750, 3075, 2005, 3019, 2653, 6364, 999, 2009, 7126, 2007, 19204, 3989, 1010, 29217, 1010, 1998, 2062, 1012, 102]


In [29]:
decoded_text = tokenizer.decode(input_ids)
print("Decoded Text:", decoded_text)

Decoded Text: [CLS] nltk is a powerful python library for natural language processing! it helps with tokenization, stemming, and more. [SEP]
