In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each string is one document. Five of the seven documents follow
# the same "<company> is announcing new <product> tomorrow" template, so the
# template words occur in most documents while each company/product name
# occurs in exactly one.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [5]:
# Create a TF-IDF vectorizer with default settings, then learn the vocabulary
# and idf weights from the corpus and vectorize it in one step.
# fit_transform() is documented as equivalent to fit() followed by transform()
# on the same documents, but it avoids processing the corpus twice.
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)

In [6]:
# Print the learned vocabulary: a dict mapping each token to its column index
# in the TF-IDF matrix. As the output shows, tokens are lowercased, hyphenated
# words such as "model-3" are split, and single-character pieces ("3", "6", "I")
# do not appear in the vocabulary.

print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [7]:
# List the names of all attributes and methods available on the fitted
# vectorizer object (dunder and underscore-prefixed names are included).
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_params',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_tfidf',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',

In [8]:
# Show the inverse document frequency (idf) learned for every vocabulary term.
# The feature names are returned ordered by column index, so they pair up
# one-to-one with the entries of v.idf_ and no per-word vocabulary lookup
# is required.
all_feature_names = v.get_feature_names_out()

for word, idf_score in zip(all_feature_names, v.idf_):
    print(f"{word} : {idf_score}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891


In [9]:
# Display the first two documents of the corpus, for comparison with their
# TF-IDF rows printed further down.
corpus[:2]

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [11]:
# Print the TF-IDF matrix in its sparse text form. Each output line reads
# "(document index, vocabulary column index)  weight"; only the terms that
# occur in a given document are listed for it.
print(transform_output)

  (0, 25)	0.2426654728284301
  (0, 22)	0.7279964184852903
  (0, 17)	0.2426654728284301
  (0, 16)	0.11527032701364152
  (0, 15)	0.2426654728284301
  (0, 10)	0.40286636477562926
  (0, 7)	0.2426654728284301
  (0, 0)	0.2426654728284301
  (1, 26)	0.30652086071532464
  (1, 20)	0.30652086071532464
  (1, 16)	0.26982671076064085
  (1, 14)	0.5680354003049032
  (1, 5)	0.5680354003049032
  (1, 4)	0.30652086071532464
  (2, 26)	0.30652086071532464
  (2, 24)	0.5680354003049032
  (2, 20)	0.30652086071532464
  (2, 19)	0.5680354003049032
  (2, 16)	0.26982671076064085
  (2, 4)	0.30652086071532464
  (3, 26)	0.30652086071532464
  (3, 21)	0.5680354003049032
  (3, 20)	0.30652086071532464
  (3, 16)	0.26982671076064085
  (3, 12)	0.5680354003049032
  (3, 4)	0.30652086071532464
  (4, 26)	0.30652086071532464
  (4, 23)	0.5680354003049032
  (4, 20)	0.30652086071532464
  (4, 18)	0.5680354003049032
  (4, 16)	0.26982671076064085
  (4, 4)	0.30652086071532464
  (5, 26)	0.26652333217709795
  (5, 20)	0.26652333217709795
 

In [12]:
# Walk the TF-IDF rows and the raw documents in lockstep, printing each
# document's sparse vector followed by its original text.
for tfidf_row, document in zip(transform_output, corpus):
    print(tfidf_row)
    print(document)

  (0, 25)	0.2426654728284301
  (0, 22)	0.7279964184852903
  (0, 17)	0.2426654728284301
  (0, 16)	0.11527032701364152
  (0, 15)	0.2426654728284301
  (0, 10)	0.40286636477562926
  (0, 7)	0.2426654728284301
  (0, 0)	0.2426654728284301
Thor eating pizza, Loki is eating pizza, Ironman ate pizza already
  (0, 26)	0.30652086071532464
  (0, 20)	0.30652086071532464
  (0, 16)	0.26982671076064085
  (0, 14)	0.5680354003049032
  (0, 5)	0.5680354003049032
  (0, 4)	0.30652086071532464
Apple is announcing new iphone tomorrow
  (0, 26)	0.30652086071532464
  (0, 24)	0.5680354003049032
  (0, 20)	0.30652086071532464
  (0, 19)	0.5680354003049032
  (0, 16)	0.26982671076064085
  (0, 4)	0.30652086071532464
Tesla is announcing new model-3 tomorrow
  (0, 26)	0.30652086071532464
  (0, 21)	0.5680354003049032
  (0, 20)	0.30652086071532464
  (0, 16)	0.26982671076064085
  (0, 12)	0.5680354003049032
  (0, 4)	0.30652086071532464
Google is announcing new pixel-6 tomorrow
  (0, 26)	0.30652086071532464
  (0, 23)	0.568035