In [None]:
# Standard library
from collections import Counter

# Third-party
import nltk
import pandas as pd
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# WordNet, the POS tagger, the stop-word lists and the Open Multilingual
# WordNet). Packages that are already installed are simply reported as
# up-to-date.
for _resource in (
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'stopwords',
    'omw-1.4',
):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Sample text used by every preprocessing step below. Adjacent string
# literals are concatenated by Python, so this is still one single string.
document = (
    "Life is like a camera. "
    "Focus on what's important, capture the good times, "
    "develop from the negatives, "
    "and if things don't work out, take another shot."
)

Tokenization: splitting text into individual tokens (words, punctuation marks, and clitics such as "'s" and "n't").

In [None]:
# Break the raw string into a flat list of tokens.
tok_words = word_tokenize(document)

# Show the input and the resulting token list one after the other.
print("Original doc: ", document)
print("Tokenized doc: ", tok_words)

Original doc:  Life is like a camera. Focus on what's important, capture the good times, develop from the negatives, and if things don't work out, take another shot.
Tokenized doc:  ['Life', 'is', 'like', 'a', 'camera', '.', 'Focus', 'on', 'what', "'s", 'important', ',', 'capture', 'the', 'good', 'times', ',', 'develop', 'from', 'the', 'negatives', ',', 'and', 'if', 'things', 'do', "n't", 'work', 'out', ',', 'take', 'another', 'shot', '.']


POS tagging: assigning a grammatical category (such as noun, verb, or adjective) to each token in a sentence.

In [None]:
# Tag every token from the tokenization step; the result is a list of
# (token, tag) pairs in the original token order.
pos_words = pos_tag(tok_words)

print("POS Tags: ", pos_words)

POS Tags:  [('Life', 'NNP'), ('is', 'VBZ'), ('like', 'IN'), ('a', 'DT'), ('camera', 'NN'), ('.', '.'), ('Focus', 'VB'), ('on', 'IN'), ('what', 'WP'), ("'s", 'VBZ'), ('important', 'JJ'), (',', ','), ('capture', 'VB'), ('the', 'DT'), ('good', 'JJ'), ('times', 'NNS'), (',', ','), ('develop', 'VB'), ('from', 'IN'), ('the', 'DT'), ('negatives', 'NNS'), (',', ','), ('and', 'CC'), ('if', 'IN'), ('things', 'NNS'), ('do', 'VBP'), ("n't", 'RB'), ('work', 'VB'), ('out', 'RP'), (',', ','), ('take', 'VB'), ('another', 'DT'), ('shot', 'NN'), ('.', '.')]


Stopword Removal

In [None]:
# Build the stop-word lookup once; a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

# NLTK's English stop-word list is all lowercase, so each token is compared in
# lowercase. Without this, capitalised stop words (e.g. a sentence-initial
# "The" or "And") would slip through the filter. The surviving tokens keep
# their original casing. Punctuation tokens are not stop words and are kept.
filtered_words = [w for w in tok_words if w.lower() not in stop_words]

print("Filtered doc: ", filtered_words)

Filtered doc:  ['Life', 'like', 'camera', '.', 'Focus', "'s", 'important', ',', 'capture', 'good', 'times', ',', 'develop', 'negatives', ',', 'things', "n't", 'work', ',', 'take', 'another', 'shot', '.']


Stemming

In [None]:
# Reduce each remaining token to its Porter stem. Stems are not guaranteed to
# be dictionary words (see 'focu' and 'captur' in the output).
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))

print("Stemmed words: ", stemmed_words)

Stemmed words:  ['life', 'like', 'camera', '.', 'focu', "'s", 'import', ',', 'captur', 'good', 'time', ',', 'develop', 'neg', ',', 'thing', "n't", 'work', ',', 'take', 'anoth', 'shot', '.']


Lemmatization

In [None]:
# Map each remaining token to its WordNet lemma. No part-of-speech is passed,
# so the lemmatizer falls back to its default (noun) for every token.
lem = WordNetLemmatizer()
lem_words = [lem.lemmatize(token) for token in filtered_words]

print("Lemmatized words: ", lem_words)

Lemmatized words:  ['Life', 'like', 'camera', '.', 'Focus', "'s", 'important', ',', 'capture', 'good', 'time', ',', 'develop', 'negative', ',', 'thing', "n't", 'work', ',', 'take', 'another', 'shot', '.']


TF IDF

In [None]:
# Toy corpus for the TF-IDF demonstration: five short sentences that share
# some words ("brown" occurs in every one) and differ in others.
doc = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy cat sleeps on the brown rug",
    "Brown bears are common in this area",
    "The quick fox runs faster than the brown dog",
    "The lazy dog lies down under the brown tree",
]

`TfidfVectorizer` converts a collection of raw documents into a matrix of TF-IDF features.

In [None]:
# Word-level TF-IDF with smoothed IDF and no row normalisation, so the
# scores printed later are the raw tf * idf products.
vectorizer = TfidfVectorizer(
    analyzer="word",
    norm=None,
    use_idf=True,
    smooth_idf=True,
)

# fit() learns the vocabulary and IDF weights and returns the fitted
# vectorizer itself, so `Mat` refers to the same object as `vectorizer`.
Mat = vectorizer.fit(doc)

# Mapping of each term to its column index in the feature matrix.
vocabulary = Mat.vocabulary_

for word, number in vocabulary.items():
    print(f"{word}: {number}")

the: 21
quick: 16
brown: 3
fox: 9
jumps: 11
over: 15
lazy: 12
dog: 6
cat: 4
sleeps: 19
on: 14
rug: 17
bears: 2
are: 0
common: 5
in: 10
this: 22
area: 1
runs: 18
faster: 8
than: 20
lies: 13
down: 7
under: 24
tree: 23


In [None]:
# Fit on the corpus and encode it in one call; the result is a sparse matrix
# with one row per document and one column per vocabulary term.
tfidfMat = vectorizer.fit_transform(doc)

# Printing the sparse matrix lists only its stored entries, one per line, as:
#   (row, column)  TFIDF score
print(tfidfMat)

  (0, 6)	1.4054651081081644
  (0, 12)	1.4054651081081644
  (0, 15)	2.09861228866811
  (0, 11)	2.09861228866811
  (0, 9)	1.6931471805599454
  (0, 3)	1.0
  (0, 16)	1.6931471805599454
  (0, 21)	2.3646431135879094
  (1, 17)	2.09861228866811
  (1, 14)	2.09861228866811
  (1, 19)	2.09861228866811
  (1, 4)	2.09861228866811
  (1, 12)	1.4054651081081644
  (1, 3)	1.0
  (1, 21)	2.3646431135879094
  (2, 1)	2.09861228866811
  (2, 22)	2.09861228866811
  (2, 10)	2.09861228866811
  (2, 5)	2.09861228866811
  (2, 0)	2.09861228866811
  (2, 2)	2.09861228866811
  (2, 3)	1.0
  (3, 20)	2.09861228866811
  (3, 8)	2.09861228866811
  (3, 18)	2.09861228866811
  (3, 6)	1.4054651081081644
  (3, 9)	1.6931471805599454
  (3, 3)	1.0
  (3, 16)	1.6931471805599454
  (3, 21)	2.3646431135879094
  (4, 23)	2.09861228866811
  (4, 24)	2.09861228866811
  (4, 7)	2.09861228866811
  (4, 13)	2.09861228866811
  (4, 6)	1.4054651081081644
  (4, 12)	1.4054651081081644
  (4, 3)	1.0
  (4, 21)	2.3646431135879094


In [None]:
# Terms of the learned vocabulary, ordered by column index, so position i
# labels column i of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

print(feature_names)

['are' 'area' 'bears' 'brown' 'cat' 'common' 'dog' 'down' 'faster' 'fox'
 'in' 'jumps' 'lazy' 'lies' 'on' 'over' 'quick' 'rug' 'runs' 'sleeps'
 'than' 'the' 'this' 'tree' 'under']


In [None]:
# Expand the sparse TF-IDF matrix into a dense one, then into plain nested
# Python lists (one inner list per document).
#
# Most entries of this matrix are zero. The sparse form stores only the
# non-zero entries, whereas the dense form stores every entry, zeros included.
dense = tfidfMat.todense()
denselist = dense.tolist()

In [None]:
# Build a pandas DataFrame of the TF-IDF values: one row per document,
# one column per feature name.
df = pd.DataFrame(data=denselist, columns=feature_names)

# A bare last expression renders the frame as a rich table.
df

Unnamed: 0,are,area,bears,brown,cat,common,dog,down,faster,fox,...,over,quick,rug,runs,sleeps,than,the,this,tree,under
0,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,0.0,0.0,1.693147,...,2.098612,1.693147,0.0,0.0,0.0,0.0,2.364643,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,2.098612,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.098612,0.0,2.098612,0.0,2.364643,0.0,0.0,0.0
2,2.098612,2.098612,2.098612,1.0,0.0,2.098612,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.098612,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,0.0,2.098612,1.693147,...,0.0,1.693147,0.0,2.098612,0.0,2.098612,2.364643,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.405465,2.098612,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.364643,0.0,2.098612,2.098612



Term frequency measures how frequently a term (word) occurs in a document.

TF(t, d) = (number of times t appears in d) / (total number of terms in d)

Inverse document frequency measures how important a term is across all documents in the corpus.

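IDF(t) = log(N / (1 + df(t))), where N is the number of documents in the corpus and df(t) is the number of documents that contain t.

Note: with the settings used above (`smooth_idf=True`, `norm=None`), scikit-learn's `TfidfVectorizer` uses the raw term count as TF and computes IDF(t) = ln((1 + N) / (1 + df(t))) + 1. That is why "brown", which occurs once in each of the 5 documents, scores exactly 1.0 in the matrix above.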

TF - IDF(t, d) = TF(t, d) * IDF(t)