1. Extract a sample document and apply the following document preprocessing methods: tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency (TF-IDF).

In [46]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# Fetch the NLTK data packages the later cells rely on. The captured log shows
# each call reporting "already up-to-date" when the package is present locally.
# Stop-word lists, read later through nltk.corpus.stopwords.
nltk.download("stopwords")
# WordNet data, needed by WordNetLemmatizer.
nltk.download("wordnet")
# English tagger model, presumably the one pos_tag loads (confirm for your NLTK version).
# As the last expression in the cell, this call's return value (True) is displayed.
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [48]:
# Sample document: the first text run through the preprocessing pipeline.
# Adjacent string literals are concatenated, so this is still one sentence.
document1 = (
    "Text analytics is the process of deriving meaningful information "
    "from natural language text."
)

In [49]:
# Tokenization: split document 1 into word and punctuation tokens.
# The same tokenizer instance is reused for the other documents further down.
tokenizer = TreebankWordTokenizer()
# Original casing is preserved and the trailing "." becomes its own token
# (see the printed list below).
tokens1 = tokenizer.tokenize(document1)
print("\nTokens: ", tokens1)


Tokens:  ['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'meaningful', 'information', 'from', 'natural', 'language', 'text', '.']


In [50]:
# POS (Parts Of Speech) tagging: label every token of document 1 with its tag.
# pos_tag returns a list of (token, tag) tuples; the tags shown in the output
# (NN, VBZ, JJ, ...) are Penn Treebank style.
# Note: pos_tags1 is only displayed here; no later cell visible in this
# notebook consumes it.
pos_tags1 = pos_tag(tokens1)
print("\nPOS tags: ", pos_tags1)


POS tags:  [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('meaningful', 'JJ'), ('information', 'NN'), ('from', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('text', 'NN'), ('.', '.')]


In [51]:
# Stop words removal: drop common English function words from document 1.
# The stop-word list is converted to a set once so membership tests are cheap;
# later cells reuse this same set for the other documents.
stop_words = set(stopwords.words("english"))
# Filter the tokens of document 1. This must iterate over tokens1: the name
# `tokens` is never defined in this notebook and raises NameError on a fresh
# kernel. Tokens are lower-cased only for the comparison, so the surviving
# tokens keep their original casing; punctuation such as "." is not a stop word
# and is therefore kept.
filtered_tokens1 = [word for word in tokens1 if word.lower() not in stop_words]
print("\nFiltered words (stop words removed): ", filtered_tokens1)


Filtered words (stop words removed):  ['Text', 'analytics', 'process', 'deriving', 'meaningful', 'information', 'natural', 'language', 'text', '.']


In [52]:
# Stemming: reduce each remaining token of document 1 to its Porter stem.
# Stems are produced by suffix stripping and need not be dictionary words
# (e.g. "analytics" -> "analyt" in the output below); the stemmer also
# lower-cases its input ("Text" -> "text").
stemmer = PorterStemmer()
# This must iterate over filtered_tokens1: the name `filtered_tokens` is never
# defined in this notebook and raises NameError on a fresh kernel.
stemmed_tokens1 = [stemmer.stem(word) for word in filtered_tokens1]
print("\nStemmed tokens: ", stemmed_tokens1)


Stemmed tokens:  ['text', 'analyt', 'process', 'deriv', 'meaning', 'inform', 'natur', 'languag', 'text', '.']


In [53]:
# Lemmatization: map each remaining token of document 1 to its WordNet lemma.
# The same lemmatizer instance is reused for the other documents further down.
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a POS argument, so every token is treated as a
# noun. That is why verb forms such as "deriving" come back unchanged in the
# output below, and why casing is preserved ("Text" stays "Text").
# NOTE(review): passing the POS tags computed earlier would lemmatize verbs and
# adjectives too, but it would have to be applied to all documents consistently.
lemmatized_tokens1 = [lemmatizer.lemmatize(word) for word in filtered_tokens1]
print("\nLemmatized tokens: ", lemmatized_tokens1)


Lemmatized tokens:  ['Text', 'analytics', 'process', 'deriving', 'meaningful', 'information', 'natural', 'language', 'text', '.']


In [54]:
# Run the same preprocessing steps on two further documents so the corpus
# used for TF-IDF contains three texts.
document2 = "Natural language processing and text mining help extract useful insights from text data."
document3 = "Text analytics tools are essential for analyzing large volumes of unstructured text."

# Tokenize with the tokenizer instance created for document 1.
tokens2, tokens3 = (tokenizer.tokenize(doc) for doc in (document2, document3))

# Tag every token with its part of speech.
pos_tags2, pos_tags3 = (pos_tag(toks) for toks in (tokens2, tokens3))

# Remove stop words; tokens are lower-cased for the comparison only.
filtered_tokens2, filtered_tokens3 = (
    [tok for tok in toks if tok.lower() not in stop_words]
    for toks in (tokens2, tokens3)
)

# Lemmatize what is left (default POS, exactly as for document 1).
lemmatized_tokens2, lemmatized_tokens3 = (
    [lemmatizer.lemmatize(tok) for tok in toks]
    for toks in (filtered_tokens2, filtered_tokens3)
)

In [55]:
# Term Frequency and Inverse Document Frequency (TF-IDF)
# TF-IDF scores how important a term is to one document relative to the whole
# corpus, which is why every document gets its own row of scores.

# Re-join each lemmatized token list into a single space-separated string;
# the vectorizer is fed text documents rather than token lists.
processed_text1, processed_text2, processed_text3 = (
    " ".join(lemmas)
    for lemmas in (lemmatized_tokens1, lemmatized_tokens2, lemmatized_tokens3)
)

documents = [processed_text1, processed_text2, processed_text3]

# Learn the vocabulary and IDF weights from the corpus and vectorize it.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense array with one row per document and one column per vocabulary term.
tfidf_values = tfidf_matrix.toarray()
tfidf_feature_names = vectorizer.get_feature_names_out()

print("\nTF-IDF representation: ")
for doc_number, doc_scores in enumerate(tfidf_values, start=1):
    print(f"\nTF-IDF for document {doc_number}:")
    for term, weight in zip(tfidf_feature_names, doc_scores):
        # Only terms that occur in this document (non-zero score) are listed;
        # drop this condition to print the full vocabulary for every document.
        if weight > 0:
            print(term, ":", round(weight, 4))


TF-IDF representation: 

TF-IDF for document 1:
analytics : 0.2848
deriving : 0.3745
information : 0.3745
language : 0.2848
meaningful : 0.3745
natural : 0.2848
process : 0.3745
text : 0.4424

TF-IDF for document 2:
data : 0.3236
extract : 0.3236
help : 0.3236
insight : 0.3236
language : 0.2461
mining : 0.3236
natural : 0.2461
processing : 0.3236
text : 0.3822
useful : 0.3236

TF-IDF for document 3:
analytics : 0.2693
analyzing : 0.3541
essential : 0.3541
large : 0.3541
text : 0.4183
tool : 0.3541
unstructured : 0.3541
volume : 0.3541
