In [2]:
sample_document = "Tokenization is the process of breaking down text into words and punctuation. POS tagging identifies the grammatical parts of speech of each word. Stop words are common words that are often removed. Stemming reduces words to their base or root form. Lemmatization is similar to stemming but aims to return the base or dictionary form of a word."


In [3]:
import nltk
from nltk import word_tokenize

tokens = word_tokenize(sample_document)
print(tokens)


['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'words', 'and', 'punctuation', '.', 'POS', 'tagging', 'identifies', 'the', 'grammatical', 'parts', 'of', 'speech', 'of', 'each', 'word', '.', 'Stop', 'words', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', '.', 'Stemming', 'reduces', 'words', 'to', 'their', 'base', 'or', 'root', 'form', '.', 'Lemmatization', 'is', 'similar', 'to', 'stemming', 'but', 'aims', 'to', 'return', 'the', 'base', 'or', 'dictionary', 'form', 'of', 'a', 'word', '.']


In [4]:
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('breaking', 'VBG'), ('down', 'RP'), ('text', 'RB'), ('into', 'IN'), ('words', 'NNS'), ('and', 'CC'), ('punctuation', 'NN'), ('.', '.'), ('POS', 'NNP'), ('tagging', 'VBG'), ('identifies', 'NNS'), ('the', 'DT'), ('grammatical', 'JJ'), ('parts', 'NNS'), ('of', 'IN'), ('speech', 'NN'), ('of', 'IN'), ('each', 'DT'), ('word', 'NN'), ('.', '.'), ('Stop', 'VB'), ('words', 'NNS'), ('are', 'VBP'), ('common', 'JJ'), ('words', 'NNS'), ('that', 'WDT'), ('are', 'VBP'), ('often', 'RB'), ('removed', 'VBN'), ('.', '.'), ('Stemming', 'VBG'), ('reduces', 'NNS'), ('words', 'NNS'), ('to', 'TO'), ('their', 'PRP$'), ('base', 'NN'), ('or', 'CC'), ('root', 'NN'), ('form', 'NN'), ('.', '.'), ('Lemmatization', 'NNP'), ('is', 'VBZ'), ('similar', 'JJ'), ('to', 'TO'), ('stemming', 'VBG'), ('but', 'CC'), ('aims', 'VBZ'), ('to', 'TO'), ('return', 'VB'), ('the', 'DT'), ('base', 'NN'), ('or', 'CC'), ('dictionary', 'JJ'), ('form', '

In [5]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

['Tokenization', 'process', 'breaking', 'text', 'words', 'punctuation', '.', 'POS', 'tagging', 'identifies', 'grammatical', 'parts', 'speech', 'word', '.', 'Stop', 'words', 'common', 'words', 'often', 'removed', '.', 'Stemming', 'reduces', 'words', 'base', 'root', 'form', '.', 'Lemmatization', 'similar', 'stemming', 'aims', 'return', 'base', 'dictionary', 'form', 'word', '.']


In [6]:
from nltk import PorterStemmer

stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)


['token', 'process', 'break', 'text', 'word', 'punctuat', '.', 'po', 'tag', 'identifi', 'grammat', 'part', 'speech', 'word', '.', 'stop', 'word', 'common', 'word', 'often', 'remov', '.', 'stem', 'reduc', 'word', 'base', 'root', 'form', '.', 'lemmat', 'similar', 'stem', 'aim', 'return', 'base', 'dictionari', 'form', 'word', '.']


In [7]:
from nltk import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print(lemmatized_tokens)

['Tokenization', 'process', 'breaking', 'text', 'word', 'punctuation', '.', 'POS', 'tagging', 'identifies', 'grammatical', 'part', 'speech', 'word', '.', 'Stop', 'word', 'common', 'word', 'often', 'removed', '.', 'Stemming', 'reduces', 'word', 'base', 'root', 'form', '.', 'Lemmatization', 'similar', 'stemming', 'aim', 'return', 'base', 'dictionary', 'form', 'word', '.']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [sample_document]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print("Feature names:")
print(vectorizer.get_feature_names_out())
print(X)
print("\nTF-IDF values:")
print(X.toarray())


Feature names:
['aims' 'and' 'are' 'base' 'breaking' 'but' 'common' 'dictionary' 'down'
 'each' 'form' 'grammatical' 'identifies' 'into' 'is' 'lemmatization' 'of'
 'often' 'or' 'parts' 'pos' 'process' 'punctuation' 'reduces' 'removed'
 'return' 'root' 'similar' 'speech' 'stemming' 'stop' 'tagging' 'text'
 'that' 'the' 'their' 'to' 'tokenization' 'word' 'words']
  (0, 7)	0.09667364890456635
  (0, 25)	0.09667364890456635
  (0, 0)	0.09667364890456635
  (0, 5)	0.09667364890456635
  (0, 27)	0.09667364890456635
  (0, 15)	0.09667364890456635
  (0, 10)	0.1933472978091327
  (0, 26)	0.09667364890456635
  (0, 18)	0.1933472978091327
  (0, 3)	0.1933472978091327
  (0, 35)	0.09667364890456635
  (0, 36)	0.29002094671369905
  (0, 23)	0.09667364890456635
  (0, 29)	0.1933472978091327
  (0, 24)	0.09667364890456635
  (0, 17)	0.09667364890456635
  (0, 33)	0.09667364890456635
  (0, 6)	0.09667364890456635
  (0, 2)	0.1933472978091327
  (0, 30)	0.09667364890456635
  (0, 38)	0.1933472978091327
  (0, 9)	0.0966736