In [4]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

# Sample Document
document = "Text analytics is the process of deriving meaningful information from natural language text."

# --- 1. Preprocessing ---

# Tokenization
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(document)
print("\nTokens:\n", tokens)

# POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tags:\n", pos_tags)

# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nAfter Stop Words Removal:\n", filtered_tokens)

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nAfter Stemming:\n", stemmed_tokens)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nAfter Lemmatization:\n", lemmatized_tokens)

# --- 2. Term Frequency and Inverse Document Frequency (TF-IDF) ---

# Use the lemmatized tokens joined into text
processed_text = " ".join(lemmatized_tokens)

# Since TF-IDF needs a list of documents, create a list
documents = [processed_text]

# Create TF-IDF model
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert TF-IDF result to readable form
tfidf_feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf_matrix.toarray()

print("\nTF-IDF Representation:")
for word, score in zip(tfidf_feature_names, tfidf_values[0]):
    print(f"{word}: {score:.4f}")



Tokens:
 ['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'meaningful', 'information', 'from', 'natural', 'language', 'text', '.']

POS Tags:
 [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('meaningful', 'JJ'), ('information', 'NN'), ('from', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('text', 'NN'), ('.', '.')]

After Stop Words Removal:
 ['Text', 'analytics', 'process', 'deriving', 'meaningful', 'information', 'natural', 'language', 'text', '.']

After Stemming:
 ['text', 'analyt', 'process', 'deriv', 'meaning', 'inform', 'natur', 'languag', 'text', '.']

After Lemmatization:
 ['Text', 'analytics', 'process', 'deriving', 'meaningful', 'information', 'natural', 'language', 'text', '.']

TF-IDF Representation:
analytics: 0.3015
deriving: 0.3015
information: 0.3015
language: 0.3015
meaningful: 0.3015
natural: 0.3015
process: 0.3015
text: 0.6030


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
