In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [2]:
# Toy corpus of three short sentences. It is tokenized by hand below and
# also passed unchanged to the scikit-learn vectorizers.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [3]:
# Tokenize: lowercase each sentence and split it on whitespace.
docs = [sentence.lower().split() for sentence in corpus]

# Vocabulary: every distinct token seen in any document, sorted alphabetically.
vocab = sorted({token for tokens in docs for token in tokens})

In [4]:
# Term frequency: for each document, map every vocabulary word to
# (occurrences of the word in the document) / (number of tokens in the document).
# `tf` ends up as a list with one dict per document, keyed in `vocab` order.
tf = []
for doc in docs:
    word_count = {word: doc.count(word) for word in vocab}
    doc_len = len(doc)
    # A blank or whitespace-only corpus entry tokenizes to an empty list;
    # give it all-zero frequencies instead of raising ZeroDivisionError.
    tf.append({
        word: (count / doc_len if doc_len else 0.0)
        for word, count in word_count.items()
    })

In [6]:
# Number of documents in the corpus.
N = len(corpus)

# Document frequency: how many documents contain each vocabulary word.
# Note: `df` here is a plain dict, not a pandas DataFrame.
df = {
    term: sum(1 for tokens in docs if term in tokens)
    for term in vocab
}

# Inverse document frequency (natural log, unsmoothed): log(N / df).
# A word that appears in every document gets log(1) == 0.
idf = {term: math.log(N / doc_freq) for term, doc_freq in df.items()}

In [7]:
# Manual TF-IDF: scale each document's term frequencies by the corpus-wide IDF.
manual_tfidf = [
    {word: doc_tf[word] * idf[word] for word in vocab}
    for doc_tf in tf
]

# One row per document, one column per vocabulary word; rounded for display.
manual_df = pd.DataFrame(manual_tfidf).round(3)
print("Manual TF-IDF:\n", manual_df)

Manual TF-IDF:
        a    and    are  bodies  celestial     is   moon  satellite  star  \
0  0.081  0.000  0.000   0.000      0.000  0.081  0.000       0.00  0.22   
1  0.081  0.000  0.000   0.000      0.000  0.081  0.081       0.22  0.00   
2  0.000  0.157  0.157   0.157      0.157  0.000  0.058       0.00  0.00   

     sun  the  
0  0.081  0.0  
1  0.000  0.0  
2  0.058  0.0  


In [8]:
# Raw term counts from scikit-learn, using CountVectorizer with no arguments
# (all defaults). Per the scikit-learn docs the default token pattern keeps
# only tokens of two or more characters, so 'a' gets no column here.
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(corpus)

# Densify the sparse matrix and label columns with the learned feature names.
count_df = pd.DataFrame(
    data=count_matrix.toarray(),
    columns=count_vec.get_feature_names_out(),
)
print("\nCountVectorizer:\n", count_df)


CountVectorizer:
    and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1


In [9]:
# TF-IDF from scikit-learn, using TfidfVectorizer with no arguments (all defaults).
# These numbers are not expected to match the manual table: per the
# scikit-learn docs the defaults drop single-character tokens, use a smoothed
# IDF of ln((1 + n) / (1 + df)) + 1, and L2-normalize each row.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)

# Densify, label columns with the learned feature names, and round for display.
tfidf_dense = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vec.get_feature_names_out(),
)
tfidf_df = tfidf_dense.round(3)
print("\nTfidfVectorizer:\n", tfidf_df)


TfidfVectorizer:
      and    are  bodies  celestial    is   moon  satellite   star    sun  \
0  0.000  0.000   0.000      0.000  0.48  0.000      0.000  0.632  0.480   
1  0.000  0.000   0.000      0.000  0.48  0.480      0.632  0.000  0.000   
2  0.426  0.426   0.426      0.426  0.00  0.324      0.000  0.000  0.324   

     the  
0  0.373  
1  0.373  
2  0.252  
