In [63]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [64]:
pip install --upgrade nltk


Note: you may need to restart the kernel to use updated packages.


In [65]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\avant\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\avant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [66]:
# Sample document
document = "Text analytics is the process of deriving information from text sources. It involves several steps such as tokenization, POS tagging, stop words removal, stemming, and lemmatization."

In [67]:
# Tokenization
tokens = word_tokenize(document)
print(tokens)

['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'information', 'from', 'text', 'sources', '.', 'It', 'involves', 'several', 'steps', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [68]:
# POS tagging
pos_tags = pos_tag(tokens)
print(pos_tags)

[('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('information', 'NN'), ('from', 'IN'), ('text', 'NN'), ('sources', 'NNS'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('several', 'JJ'), ('steps', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


In [69]:
# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

['Text', 'analytics', 'process', 'deriving', 'information', 'text', 'sources', '.', 'involves', 'several', 'steps', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'lemmatization', '.']


In [70]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)

['text', 'analyt', 'process', 'deriv', 'inform', 'text', 'sourc', '.', 'involv', 'sever', 'step', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', ',', 'lemmat', '.']


In [71]:

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print(lemmatized_tokens)

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - 'C:\\Users\\avant/nltk_data'
    - 'C:\\Users\\avant\\anaconda3\\nltk_data'
    - 'C:\\Users\\avant\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\avant\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\avant\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


#Sample documents
documents = [
    "Text analytics is the process of deriving meaningful insights from textual data.",
    "Text mining techniques are used to analyze large volumes of unstructured text.",
    "Natural language processing helps in understanding and generating human language.",
]

#Create TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

#Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names()

#Print TF-IDF values for each document
for i in range(len(documents)):
    print(f"Document {i+1}:")
    for j in range(len(feature_names)):
        print(f"{feature_names[j]}: {tfidf_matrix[i, j]}")

In [61]:
# Print the results
print("Original Document:")
print(document)
print("\nTokenization:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nStop Words Removal:")
print(filtered_tokens)
print("\nStemming:")
print(stemmed_tokens)
print("\nLemmatization:")
print(lemmatized_tokens)

Original Document:
Text analytics is the process of deriving information from text sources. It involves several steps such as tokenization, POS tagging, stop words removal, stemming, and lemmatization.

Tokenization:
['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'information', 'from', 'text', 'sources', '.', 'It', 'involves', 'several', 'steps', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']

POS Tagging:
[('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('information', 'NN'), ('from', 'IN'), ('text', 'NN'), ('sources', 'NNS'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('several', 'JJ'), ('steps', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (

NameError: name 'lemmatized_tokens' is not defined

representation of the document by calculating Term Frequency (TF) and Inverse Document Frequency (IDF).