In [1]:
import nltk

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text the whole walkthrough operates on (adjacent literals are joined
# into one string by the parser).
document = (
    "This is a sample document. It contains multiple sentences. "
    "We will perform preprocessing on this document."
)

# Split the text into tokens, then tag every token with its part of speech.
tokens = word_tokenize(document)
pos_tags = nltk.pos_tag(tokens)

# Drop tokens whose lowercase form is an English stop word. Only the
# stop-word set is consulted, so any token not in it is kept.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

# Stem each surviving token with the Porter stemmer.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))

# Lemmatize the same filtered tokens. lemmatize() is called with the word
# only, so the POS tags computed above are not passed to it.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

# Fit TF-IDF on a corpus made of this single document and transform it.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])

# Report every intermediate result, one labelled line per stage.
for label, result in (
    ("Original Document:", document),
    ("Tokenization:", tokens),
    ("POS Tagging:", pos_tags),
    ("Stop Words Removal:", filtered_tokens),
    ("Stemming:", stemmed_tokens),
    ("Lemmatization:", lemmatized_tokens),
):
    print(label, result)
# Densify only after the lines above are printed, as the original did.
print("Term Frequency and Inverse Document Frequency:", tfidf_matrix.toarray())


Original Document: This is a sample document. It contains multiple sentences. We will perform preprocessing on this document.
Tokenization: ['This', 'is', 'a', 'sample', 'document', '.', 'It', 'contains', 'multiple', 'sentences', '.', 'We', 'will', 'perform', 'preprocessing', 'on', 'this', 'document', '.']
POS Tagging: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('document', 'NN'), ('.', '.'), ('It', 'PRP'), ('contains', 'VBZ'), ('multiple', 'JJ'), ('sentences', 'NNS'), ('.', '.'), ('We', 'PRP'), ('will', 'MD'), ('perform', 'VB'), ('preprocessing', 'VBG'), ('on', 'IN'), ('this', 'DT'), ('document', 'NN'), ('.', '.')]
Stop Words Removal: ['sample', 'document', '.', 'contains', 'multiple', 'sentences', '.', 'perform', 'preprocessing', 'document', '.']
Stemming: ['sampl', 'document', '.', 'contain', 'multipl', 'sentenc', '.', 'perform', 'preprocess', 'document', '.']
Lemmatization: ['sample', 'document', '.', 'contains', 'multiple', 'sentence', '.', 'perform', 'preproce