In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
 
# Deliberately tiny corpus: five one-line "documents" that share a few words,
# just enough to make the usage differences between the APIs visible.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

## 1. Compute TF

In [2]:
# Build the bag-of-words vectorizer with its default settings.
cv = CountVectorizer()

# Learn the vocabulary from docs and, in the same call, return the sparse
# document-by-term matrix of raw word counts.
word_count_vector = cv.fit_transform(docs)

In [3]:
# Densify the sparse count matrix so every document row is printed in full.
print('Fit Vectorizer to train set\n', word_count_vector.toarray())

Fit Vectorizer to train set
 [[0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 0 1 0 2 0]
 [0 1 0 0 0 1 0 1 0 1 0 1 0 0 2 0]
 [1 0 1 0 1 0 0 0 0 1 0 0 0 0 2 0]
 [0 0 0 1 0 0 0 0 0 1 1 0 0 1 2 0]]


Now, let’s check the shape. We should have 5 rows (one per document) and 16 columns (one per unique word, not counting single-character words such as “a”, which CountVectorizer drops by default):

In [3]:
# Dimensions of the count matrix as (number of documents, number of vocabulary terms).
word_count_vector.shape

(5, 16)

In [4]:
# Vocabulary terms in column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() is used to keep displaying a plain list of Python strings.
cv.get_feature_names_out().tolist()

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'from',
 'had',
 'house',
 'little',
 'mouse',
 'of',
 'ran',
 'saw',
 'story',
 'the',
 'tiny']

In [5]:
# Pick the first document's row of counts and show it as a dense column,
# one entry per vocabulary term.
document_vector = word_count_vector[0]
document_vector.T.todense()

matrix([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1]], dtype=int64)

## 2. Compute IDF

In [6]:
# Learn the IDF weights from the raw counts. Only fit() is called here,
# so no document is transformed yet.
idf = TfidfTransformer(use_idf=True, smooth_idf=True)
idf.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

#### <font color="red">fit()</font> = only learns the IDF weights from the counts; no document has been transformed yet. The result is a single vector of length n, n = number of terms
#### <font color="red">transform()</font> = applies the learned weights to every document and produces an m x n matrix, m = number of documents; n = number of terms

In [7]:
# idf_ holds the IDF weights learned by fit(): a 1-D array with one value per
# vocabulary term, in the same order as the vectorizer's feature names.
# There is no per-document matrix to inspect yet because only fit() has been
# called; the transformer object itself is not a matrix and has no shape.
idf.idf_

array([2.09861229, 2.09861229, 1.69314718, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 1.69314718, 2.09861229, 1.        ,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 1.        ,
       2.09861229])

In [9]:
# Pair every vocabulary term with its learned IDF weight.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df_idf = pd.DataFrame(idf.idf_, index=cv.get_feature_names_out(), columns=["IDF"])

# Sort ascending so the most common (lowest-IDF) terms come first.
df_idf.sort_values(by=["IDF"])

Unnamed: 0,IDF
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


#### Notice that the words ‘mouse’ and ‘the’ have the lowest IDF values. This is expected as these words appear in each and every document in our collection. <font color="red">The lower the IDF value of a word, the less unique it is to any particular document.</font>

## 3. Compute TF-IDF

In [13]:
# Raw term counts for the documents to score. The vectorizer was already
# fitted earlier, so transform() alone is enough here.
word_count_vector_2 = cv.transform(docs)

# Fit a fresh transformer on those counts and get the tf-idf matrix in one call.
tfidf = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_vector = tfidf.fit_transform(word_count_vector_2)

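#### By invoking .fit_transform() you will finally be computing the tf-idf scores for your docs. Internally this multiplies each term frequency by its IDF weight (tf * idf) and then, because norm='l2' is the default, scales each document vector to unit length, which is why the scores below are not the raw products.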

In [14]:
# Printing a sparse matrix lists only its non-zero entries, each shown as
# (document index, term index) followed by the tf-idf score.
print(tfidf_vector)

  (0, 15)	0.4935620852501244
  (0, 14)	0.23518497814732847
  (0, 9)	0.23518497814732847
  (0, 8)	0.4935620852501244
  (0, 7)	0.39820278266020154
  (0, 6)	0.4935620852501244
  (1, 14)	0.5709412442157336
  (1, 12)	0.5990921556092994
  (1, 9)	0.2854706221078668
  (1, 2)	0.4833437789546282
  (2, 14)	0.4356144053674603
  (2, 11)	0.4570928721125019
  (2, 9)	0.21780720268373016
  (2, 7)	0.3687796511296063
  (2, 5)	0.4570928721125019
  (2, 1)	0.4570928721125019
  (3, 14)	0.4897741328791844
  (3, 9)	0.2448870664395922
  (3, 4)	0.5139230069660121
  (3, 2)	0.4146298460977916
  (3, 0)	0.5139230069660121
  (4, 14)	0.46864605709870183
  (4, 13)	0.4917531872315962
  (4, 10)	0.4917531872315962
  (4, 9)	0.23432302854935091
  (4, 3)	0.4917531872315962


In [19]:
# Vocabulary terms in column order of the tf-idf matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (.tolist() keeps a plain list).
feature_names = cv.get_feature_names_out().tolist()

# tf-idf row for the first document
first_document_vector = tfidf_vector[0]

# One row per term, highest score first; terms absent from the document score 0.
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["TF-IDF"])
df.sort_values(by=["TF-IDF"], ascending=False)

Unnamed: 0,TF-IDF
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


#### Notice that only certain words have scores. This is because our first document is “the house had a tiny little mouse”: all the words in this document have a TF-IDF score and everything else shows up as zero. Notice that the word “a” is missing from this list. This is because CountVectorizer’s default `token_pattern` (`(?u)\b\w\w+\b`) only keeps tokens of two or more alphanumeric characters, so single-character words are dropped.

#### The scores above make sense. The more common the word across documents, the lower its score, and the more unique a word is to our first document (e.g. ‘had’ and ‘tiny’), the higher the score. So it’s working as expected, except for the single-character word “a”, which was dropped during tokenization.

## 4. Tfidfvectorizer Usage

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Any option you would normally give to CountVectorizer can be passed here too.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# A single call takes the raw documents and returns their tf-idf matrix.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

In [21]:
# tf-idf row for the first document
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Place the tf-idf values in a pandas data frame, one row per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names_out(), columns=["TFIDF"])
df.sort_values(by=["TFIDF"], ascending=False)

Unnamed: 0,TFIDF
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


#### Here’s another way to do it by calling fit() and transform() separately and you’ll end up with the same results.

In [22]:
# Same computation in two explicit steps instead of fit_transform():
# fit() on the documents first, then transform() the same documents.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
fitted_vectorizer = tfidf_vectorizer.fit(docs)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(docs)

## Tfidftransformer vs. Tfidfvectorizer

In summary, the main differences between the two classes are as follows:

With Tfidftransformer() you will systematically compute word counts using CountVectorizer() and then compute the Inverse Document Frequency (IDF) values and only then compute the Tf-idf scores.

With Tfidfvectorizer() on the contrary, you will do all three steps at once. Under the hood, it computes the word counts, IDF values, and Tf-idf scores all using the same dataset.

## When to use what?

So now you may be wondering why you should use more steps than necessary if you can get everything done in two steps. Well, there are cases where you want to use Tfidftransformer over Tfidfvectorizer, and it is sometimes not that obvious. Here is a general guideline:

- If you need the term frequency (term count) vectors for different tasks, use Tfidftransformer.<br>
- If you need to compute tf-idf scores on documents within your “training” dataset, use Tfidfvectorizer.<br>
- If you need to compute tf-idf scores on documents outside your “training” dataset, use either one, both will work.