In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [None]:
# Download the NLTK data packages used in this notebook: tokenizer models,
# POS-tagger models, the stop-word list and WordNet.
nltk_resources = [
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
]
# nltk.download() reports failure through its boolean return value rather than
# an exception, so collect the packages that could not be fetched and stop
# here instead of hitting a LookupError in a later cell.
failed_downloads = [name for name in nltk_resources if not nltk.download(name)]
if failed_downloads:
    raise RuntimeError(
        "Could not download NLTK resources: " + ", ".join(failed_downloads)
    )

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sumit\AppData\Roaming\nltk_data...


True

In [None]:
#creating document
document = "Natural Language Processing helps computers understand human language. It includes tasks like text classification, sentiment analysis, and machine translation."

In [None]:
# Split the sample text into word and punctuation tokens
# (English is word_tokenize's default language, spelled out here).
tokens = word_tokenize(document, language='english')

In [None]:
# Attach a part-of-speech (POS) tag to every token, giving (token, tag) pairs.
# The arguments shown are pos_tag's defaults, written out explicitly.
pos_tagging = pos_tag(tokens, tagset=None, lang='eng')

In [None]:
# Remove noise from the tokens: drop English stop words (compared
# case-insensitively) and tokens that consist only of punctuation.
stop_words = set(stopwords.words('english'))
# Compare character by character: a plain `word not in string.punctuation`
# is a substring test, which lets multi-character punctuation tokens such as
# "..." or "--" slip through.
punctuation_chars = set(string.punctuation)
filter_words = [
    word
    for word in tokens
    if word.lower() not in stop_words
    and not all(char in punctuation_chars for char in word)
]

In [None]:
# Reduce every filtered word to its stem with the Porter stemmer.
stemmer = PorterStemmer()
stem_tokens = list(map(stemmer.stem, filter_words))

In [None]:
# Lemmatize the filtered words using their part-of-speech tags.
# Without a `pos` argument WordNetLemmatizer.lemmatize() looks every word up
# as a noun, so the tags computed earlier would go unused and verb forms
# would not be reduced to their base form.
lemmatizer = WordNetLemmatizer()


def treebank_to_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as produced by pos_tag) to a WordNet POS letter.

    Tags starting with J, V or R map to adjective ('a'), verb ('v') and
    adverb ('r'); every other tag, including an empty one, falls back to
    noun ('n'), which is also lemmatize()'s own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Reuse the tags computed on the full sentence (pos_tagging) rather than
# re-tagging the filtered list, which has lost its sentence context. Filtering
# depends only on the token text, so keeping the tagged tokens whose text is in
# filter_words yields the same words in the same order.
kept_words = set(filter_words)
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=treebank_to_wordnet_pos(tag))
    for word, tag in pos_tagging
    if word in kept_words
]

In [None]:
# Print the result of each preprocessing stage, each followed by a blank line.
stage_results = [
    ("Original Tokens:", tokens),
    ("POS Tags:", pos_tagging),
    ("Filtered Tokens (No Stop Words):", filter_words),
    ("Stemmed Tokens:", stem_tokens),
    ("Lemmatized Tokens:", lemmatized_tokens),
]
for label, result in stage_results:
    print(label, result, end="\n\n")

Original Tokens: ['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.', 'It', 'includes', 'tasks', 'like', 'text', 'classification', ',', 'sentiment', 'analysis', ',', 'and', 'machine', 'translation', '.']

POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('includes', 'VBZ'), ('tasks', 'NNS'), ('like', 'IN'), ('text', 'JJ'), ('classification', 'NN'), (',', ','), ('sentiment', 'NN'), ('analysis', 'NN'), (',', ','), ('and', 'CC'), ('machine', 'NN'), ('translation', 'NN'), ('.', '.')]

Filtered Tokens (No Stop Words): ['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', 'includes', 'tasks', 'like', 'text', 'classification', 'sentiment', 'analysis', 'machine', 'translation']

Stemmed Tokens: ['natur', 'languag', 'process', 'help', 'comput', 'understand', 

In [None]:
# Corpus for the TF-IDF step: the sample text followed by two more sentences.
documents = [document] + [
    "Machine learning is a subfield of artificial intelligence that focuses on building systems that learn from data.",
    "Text classification is a task in NLP to categorize text into organized groups.",
]

In [None]:
# TF-IDF (term frequency - inverse document frequency) weights each term by
# how important it is to a given document relative to the whole corpus.
# Learn the vocabulary and IDF weights first, then vectorize the same corpus.
vectorizer = TfidfVectorizer()
vectorizer.fit(documents)
tfidf_matrix = vectorizer.transform(documents)

In [None]:
# Show the TF-IDF weights as a dense array under a heading
# (one row per document, as the three-row printed output shows).
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")


TF-IDF Matrix:
[[0.22510195 0.22510195 0.         0.         0.         0.17119583
  0.22510195 0.         0.         0.         0.         0.22510195
  0.22510195 0.         0.22510195 0.         0.         0.
  0.22510195 0.45020389 0.         0.         0.22510195 0.17119583
  0.22510195 0.         0.         0.         0.         0.22510195
  0.22510195 0.         0.         0.         0.22510195 0.17119583
  0.         0.         0.22510195 0.22510195]
 [0.         0.         0.24142479 0.24142479 0.         0.
  0.         0.24142479 0.24142479 0.24142479 0.         0.
  0.         0.         0.         0.24142479 0.         0.18360978
  0.         0.         0.24142479 0.24142479 0.         0.18360978
  0.         0.         0.24142479 0.24142479 0.         0.
  0.         0.24142479 0.24142479 0.         0.         0.
  0.48284959 0.         0.         0.        ]
 [0.         0.         0.         0.         0.29526419 0.22455603
  0.         0.         0.         0.         

In [None]:
# Show the vocabulary learned by the vectorizer under a heading.
print("\nFeature Names (Vocabulary):", vectorizer.get_feature_names_out(), sep="\n")


Feature Names (Vocabulary):
['analysis' 'and' 'artificial' 'building' 'categorize' 'classification'
 'computers' 'data' 'focuses' 'from' 'groups' 'helps' 'human' 'in'
 'includes' 'intelligence' 'into' 'is' 'it' 'language' 'learn' 'learning'
 'like' 'machine' 'natural' 'nlp' 'of' 'on' 'organized' 'processing'
 'sentiment' 'subfield' 'systems' 'task' 'tasks' 'text' 'that' 'to'
 'translation' 'understand']
