In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [3]:
tokenized_corpus = [sentence.lower().split() for sentence in corpus]

vocab = sorted(set(word for sentence in tokenized_corpus for word in sentence))
print(f"Vocabulary: {vocab}")

Vocabulary: ['a', 'and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']


In [12]:
tf = []
for sentence in tokenized_corpus:
    word_counts = {}
    total_words = len(sentence)
    for word in vocab:
        word_counts[word] = sentence.count(word) / total_words
    tf.append(word_counts)

print(tf)

[{'a': 0.2, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.2, 'moon': 0.0, 'satellite': 0.0, 'star': 0.2, 'sun': 0.2, 'the': 0.2}, {'a': 0.2, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.2, 'moon': 0.2, 'satellite': 0.2, 'star': 0.0, 'sun': 0.0, 'the': 0.2}, {'a': 0.0, 'and': 0.14285714285714285, 'are': 0.14285714285714285, 'bodies': 0.14285714285714285, 'celestial': 0.14285714285714285, 'is': 0.0, 'moon': 0.14285714285714285, 'satellite': 0.0, 'star': 0.0, 'sun': 0.14285714285714285, 'the': 0.14285714285714285}]


In [None]:
N = len(tokenized_corpus)
idf = {}

for word in vocab:
    df = sum(1 for sentence in tokenized_corpus if word in sentence)
    idf[word] = math.log((N) / (1 + df)) + 1

tfidf_manual = []
for doc_tf in tf:
    doc_tfidf = {}
    for word in vocab:
        doc_tfidf[word] = doc_tf[word] * idf[word]
    tfidf_manual.append(doc_tfidf)


print("\nManual TF-IDF Results:")
for i, doc_tfidf in enumerate(tfidf_manual):
    print(f"\nDocument {i+1}:")
    print(doc_tfidf)



Manual TF-IDF Results:

Document 1:
{'a': 0.2, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.2, 'moon': 0.0, 'satellite': 0.0, 'star': 0.2810930216216329, 'sun': 0.2, 'the': 0.14246358550964383}

Document 2:
{'a': 0.2, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.2, 'moon': 0.2, 'satellite': 0.2810930216216329, 'star': 0.0, 'sun': 0.0, 'the': 0.14246358550964383}

Document 3:
{'a': 0.0, 'and': 0.20078072972973776, 'are': 0.20078072972973776, 'bodies': 0.20078072972973776, 'celestial': 0.20078072972973776, 'is': 0.0, 'moon': 0.14285714285714285, 'satellite': 0.0, 'star': 0.0, 'sun': 0.14285714285714285, 'the': 0.10175970393545987}


In [10]:
cv = CountVectorizer()
cv_matrix = cv.fit_transform(corpus)
print("\nCountVectorizer Vocabulary:", cv.get_feature_names_out())
print("CountVectorizer Matrix:\n", cv_matrix.toarray())

tv = TfidfVectorizer()
tv_matrix = tv.fit_transform(corpus)
print("\nTfidfVectorizer Vocabulary:", tv.get_feature_names_out())
print("TfidfVectorizer Matrix:\n", tv_matrix.toarray())



CountVectorizer Vocabulary: ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
CountVectorizer Matrix:
 [[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]

TfidfVectorizer Vocabulary: ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
TfidfVectorizer Matrix:
 [[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]
