### Bag Of Words

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: each string is treated as one document.
document = [
    'One Geek helps Two Geeks',
    'Two Geeks help Four Geeks',
    'Each Geek helps many other Geeks at GeeksforGeeks',
]

# Learn the vocabulary and build the document-term count matrix in one call.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(document)

# Mapping of each unique (lower-cased) word to its column index.
print('Vocabulary: ', vectorizer.vocabulary_)

# Dense view of the counts: one row per document, one column per vocabulary word.
print('Encoded Document is:', vector.toarray(), sep='\n')

Vocabulary:  {'one': 9, 'geek': 3, 'helps': 7, 'two': 11, 'geeks': 4, 'help': 6, 'four': 2, 'each': 1, 'many': 8, 'other': 10, 'at': 0, 'geeksforgeeks': 5}
Encoded Document is:
[[0 0 0 1 1 0 0 1 0 1 0 1]
 [0 0 1 0 2 0 1 0 0 0 0 1]
 [1 1 0 1 1 1 0 1 1 0 1 0]]


### TF-IDF (Term frequency - Inverse Document Frequency)

TF(t,d) = (Number of times t occurs in d)/(Total number of words in d)

IDF(t,D) = log((Number of Documents in the corpus D)/(Number of documents with the word t))

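Formula of IDF used by scikit-learn (default `smooth_idf=True`):

idf(t) = ln((1 + N)/(1 + df(t))) + 1, where N is the number of documents and df(t) is the number of documents containing t.  
(Smoothing: the 1 added to both the numerator and the denominator acts like one extra document containing every term, which prevents division by zero.)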

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Three tiny documents; the corpus list below keeps them in this order.
d0, d1, d2 = 'Geeks for geeks', 'Geeks', 'r2j'
string = [d0, d1, d2]

# Fit the TF-IDF model on the corpus and encode the corpus in the same call.
Tfidf = TfidfVectorizer()
result = Tfidf.fit_transform(string)

# Dense view of the sparse result; as the cell's last expression it is displayed.
result.toarray()

array([[0.54935123, 0.83559154, 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ]])

In [5]:
# List every vocabulary term next to the IDF weight the fitted vectorizer learned.
print('\nIDF values:')
feature_names = Tfidf.get_feature_names_out()
idf_weights = Tfidf.idf_
for position, term in enumerate(feature_names):
    # feature_names and idf_weights are aligned by position.
    print(term, ':', idf_weights[position])


IDF values:
for : 1.6931471805599454
geeks : 1.2876820724517808
r2j : 1.6931471805599454


In [6]:
# Term -> column-index mapping learned by the vectorizer.
print('\nWord indexes:', Tfidf.vocabulary_, sep='\n')

# Sparse representation: only the stored (non-zero) tf-idf weights are listed.
print('\ntf-idf value:', result, sep='\n')

# The same weights as a dense array.
print('\ntf-idf values in matrix form:', result.toarray(), sep='\n')



Word indexes:
{'geeks': 1, 'for': 0, 'r2j': 2}

tf-idf value:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (3, 3)>
  Coords	Values
  (0, 1)	0.8355915419449176
  (0, 0)	0.5493512310263033
  (1, 1)	1.0
  (2, 2)	1.0

tf-idf values in matrix form:
[[0.54935123 0.83559154 0.        ]
 [0.         1.         0.        ]
 [0.         0.         1.        ]]


In [None]:
# Two identical documents, so every term appears in every document.
d0 = d1 = 'Geeks for geeks!'

# The corpus is the list of both documents.
string = [d0, d1]

# Fit a fresh vectorizer and get the tf-idf values in one step.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> column-index mapping.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Stored (non-zero) tf-idf weights of the sparse result.
print('\ntf-idf values:', result, sep='\n')

# Learned IDF weight per term, printed without a header.
print(tfidf.idf_)


Word indexes:
{'geeks': 1, 'for': 0}

tf-idf values:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (2, 2)>
  Coords	Values
  (0, 1)	0.8944271909999159
  (0, 0)	0.4472135954999579
  (1, 1)	0.8944271909999159
  (1, 0)	0.4472135954999579
[1. 1.]
