#### Count Vectorizer

In [2]:
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

# Creating the vocabulary
vocabulary = set()

for documents in corpus:
    words = documents.lower().split()
    for word in words:
        vocabulary.add(word)

vocabulary = list(vocabulary)
word_to_index = {word:i for i,word in enumerate(vocabulary)}

def Count_vectorizer(corpus,vocabulary,word_to_index):
    vectorization = list()
    for sentence in corpus:
        vector = [0] * len(vocabulary)
        words = sentence.lower().split()
        for word in words:
            vector[word_to_index[word]] += 1
        vectorization.append(vector)
    
    return vectorization
        
print(word_to_index)
print("Vectorization of the Corpus")
Count_vectorizer(corpus,vocabulary,word_to_index)

{'a': 0, 'star': 1, 'is': 2, 'moon': 3, 'and': 4, 'celestial': 5, 'the': 6, 'bodies': 7, 'satellite': 8, 'are': 9, 'sun': 10}
Vectorization of the Corpus


[[1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1],
 [1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1]]

#### Count Vectorizer using Sklearn

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

Vectorizer = CountVectorizer()
X = Vectorizer.fit_transform(corpus)
print(Vectorizer.get_feature_names_out())
print(X.toarray())

['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]


##### TF-IDF

In [4]:
print("TERM FREQUENCY\n")
def Term_Frequency(word,sentence):
    words = sentence.lower().split()
    freq_dict = {}
    for word in words:
        freq_dict[word] = freq_dict.get(word,0) + 1
    return freq_dict[word]

for doc_index,document in enumerate(corpus):
    print(f"Document {doc_index + 1} :")
    words = document.lower().split()
    for word in words:
        print(f"{word} : {Term_Frequency(word,document)}")
    print("\n")

TERM FREQUENCY

Document 1 :
the : 1
sun : 1
is : 1
a : 1
star : 1


Document 2 :
the : 1
moon : 1
is : 1
a : 1
satellite : 1


Document 3 :
the : 1
sun : 1
and : 1
moon : 1
are : 1
celestial : 1
bodies : 1




In [5]:
import math
def Inverse_Doc_Frequecy(word,corpus):
    count = 0
    total = len(corpus)
    for document in corpus:
        words = document.lower().split()
        if word in words:
            count += 1
    return math.log10((total)/(count))

IDF = {}
for document in corpus:
    words = document.lower().split()
    for word in words:
        IDF[word] = IDF.get(word,0) + Inverse_Doc_Frequecy(word,corpus)
print("Inverse Document Frequency")
IDF

Inverse Document Frequency


{'the': 0.0,
 'sun': 0.3521825181113625,
 'is': 0.3521825181113625,
 'a': 0.3521825181113625,
 'star': 0.47712125471966244,
 'moon': 0.3521825181113625,
 'satellite': 0.47712125471966244,
 'and': 0.47712125471966244,
 'are': 0.47712125471966244,
 'celestial': 0.47712125471966244,
 'bodies': 0.47712125471966244}

In [6]:
print("TF-IDF SCORE")
for doc_index,document in enumerate(corpus):
    print(f"Document {doc_index + 1} :")
    words = document.lower().split()
    for word in words:
        print(f"{word} : {Term_Frequency(word,document)*IDF[word]}")
    print("\n")

TF-IDF SCORE
Document 1 :
the : 0.0
sun : 0.3521825181113625
is : 0.3521825181113625
a : 0.3521825181113625
star : 0.47712125471966244


Document 2 :
the : 0.0
moon : 0.3521825181113625
is : 0.3521825181113625
a : 0.3521825181113625
satellite : 0.47712125471966244


Document 3 :
the : 0.0
sun : 0.3521825181113625
and : 0.47712125471966244
moon : 0.3521825181113625
are : 0.47712125471966244
celestial : 0.47712125471966244
bodies : 0.47712125471966244




#### TF-IDF using Sklearn

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b') # To include 1 char tokens
tfidf_matrix = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()
tfidf_values = {}

for doc_index in range(len(corpus)):
    row = tfidf_matrix[doc_index]
    indices = row.indices
    data = row.data
    tfidf_values[doc_index] = {feature_names[i]: round(score,4) for i,score in zip(indices,data)}

for doc_index, values in tfidf_values.items():
    print(f"Document {doc_index+1}:")
    for word, tfidf_value in values.items():
        print(f"  {word}: {tfidf_value}")
    print()

Document 1:
  the: 0.3363
  sun: 0.4331
  is: 0.4331
  a: 0.4331
  star: 0.5694

Document 2:
  the: 0.3363
  is: 0.4331
  a: 0.4331
  moon: 0.4331
  satellite: 0.5694

Document 3:
  the: 0.2517
  sun: 0.3241
  moon: 0.3241
  and: 0.4262
  are: 0.4262
  celestial: 0.4262
  bodies: 0.4262

