In [None]:
# Install and import required packages
!pip install nltk
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data
# NLTK 3.9+ loads the tokenizer and tagger models from the renamed
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' packages, so fetch those
# as well; the legacy names are kept so older NLTK releases still find
# the data they expect.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource)

# Sample text
text = (
    "It is a truth universally acknowledged, that a single man in "
    "possession of a good fortune, must be in want of a wife."
)

# Step 1: Lowercasing
# Normalise case so later comparisons are case-insensitive.
text = text.lower()
print("Lowercased Text:", text)

# Step 2: Removing punctuation
# Delete every character listed in string.punctuation in a single pass.
text_clean = text.translate(str.maketrans("", "", string.punctuation))
print("Without Punctuation:", text_clean)

# Step 3: Tokenization
# Split the punctuation-free text into individual word tokens.
words = word_tokenize(text_clean)
print("Tokenized Words:", words)

# Step 4: Stopword Removal
# Hold the English stopwords in a set so each membership check is cheap,
# then keep only the tokens that are not in it.
stop_words = set(stopwords.words("english"))
filtered_words = [
    token
    for token in words
    if token not in stop_words
]
print("Filtered Words (No Stopwords):", filtered_words)

# Step 5: Stemming
# Run every remaining token through the Porter stemmer.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("Stemmed Words:", stemmed_words)

# Step 6: Lemmatization
# WordNetLemmatizer.lemmatize() treats its input as a noun unless a POS is
# supplied, so inflected verbs and adjectives are generally returned
# unchanged. Tag the tokens first and hand the matching WordNet POS to the
# lemmatizer; the same tags are reported in Step 7.
lemmatizer = WordNetLemmatizer()
pos_tags = pos_tag(filtered_words)

# First letter of the tagger's tag -> WordNet POS code. Anything that is
# not an adjective, verb or adverb falls back to "n", which is also the
# lemmatizer's own default.
treebank_to_wordnet = {"J": "a", "V": "v", "R": "r", "N": "n"}
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=treebank_to_wordnet.get(tag[:1], "n"))
    for word, tag in pos_tags
]
print("Lemmatized Words:", lemmatized_words)

# Step 7: POS Tagging
# The tags were computed above on the stopword-filtered tokens.
print("POS Tags:", pos_tags)

# Step 8: TF-IDF Representation
# Fit a TF-IDF vectorizer on the punctuation-free text and show the
# resulting dense matrix together with the learned vocabulary.
# NOTE(review): the corpus holds a single document, so every term has the
# same document frequency and the IDF part cannot tell terms apart.
vectorizer = TfidfVectorizer()
corpus = [text_clean]
X = vectorizer.fit_transform(corpus)
print(
    "\nTF-IDF Matrix:\n",
    X.toarray(),
)
print(
    "TF-IDF Feature Names:",
    vectorizer.get_feature_names_out(),
)
