In [1]:
# Toy corpus: each string is treated as one document by the cells that follow.
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [2]:
import numpy as np
import math

In [17]:
# Build the sorted vocabulary from the lower-cased, whitespace-split corpus.
#
# Tokens shorter than MIN_TOKEN_LENGTH characters (in this corpus only 'a')
# are left out so the vocabulary lines up with scikit-learn's default
# tokeniser, which ignores single-character words.  Filtering by length is
# used instead of slicing off the first sorted entry: the slice is only
# correct when exactly one short token exists and happens to sort first, and
# otherwise silently discards a real term.
MIN_TOKEN_LENGTH = 2

unique_set = set()
for document in corpus:
    unique_set.update(document.lower().split())

unic = sorted(word for word in unique_set if len(word) >= MIN_TOKEN_LENGTH)
print(unic)

['and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']


In [18]:
# Document-term count table: one row per document in `corpus`, one column per
# vocabulary entry in `unic`; each cell is how many times that term occurs in
# the lower-cased, whitespace-split document.
count_matrix = [
    [tokens.count(term) for term in unic]
    for tokens in (sentence.lower().split() for sentence in corpus)
]

print(count_matrix)


[[0, 0, 0, 0, 1, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 0, 0, 1], [1, 1, 1, 1, 0, 1, 0, 0, 1, 1]]


In [19]:
# Term frequency (TF): every raw count divided by the total number of counted
# terms in its document, rounded to 4 decimal places.
tf_matrix = []

for row in count_matrix:
    total_terms = sum(row)
    if total_terms == 0:
        # A document containing no vocabulary terms has no defined
        # frequencies; emit zeros instead of raising ZeroDivisionError.
        tf_row = [0.0] * len(row)
    else:
        tf_row = [round(count / total_terms, 4) for count in row]
    tf_matrix.append(tf_row)

# Print one labelled row per document, numbered from 1.
for sentence_number, tf_row in enumerate(tf_matrix, start=1):
    print(f"TF sen {sentence_number}:", tf_row)



TF sen 1: [0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.25, 0.25, 0.25]
TF sen 2: [0.0, 0.0, 0.0, 0.0, 0.25, 0.25, 0.25, 0.0, 0.0, 0.25]
TF sen 3: [0.1429, 0.1429, 0.1429, 0.1429, 0.0, 0.1429, 0.0, 0.0, 0.1429, 0.1429]


In [20]:
# Inverse document frequency (IDF) for every vocabulary term:
#     idf = log10(N / df), rounded to 4 decimal places
# where N is the number of documents and df is the number of documents whose
# count for that term is greater than zero.
N = len(corpus)

# df per term: walk each column index of count_matrix and count positive cells.
document_frequencies = [
    sum(doc[column] > 0 for doc in count_matrix)
    for column in range(len(unic))
]
idf_vector = [round(math.log10(N / df), 4) for df in document_frequencies]

# Display each term next to its IDF weight.
print("IDF values:")
for word, weight in zip(unic, idf_vector):
    print(f"{word}: {weight}")

IDF values:
and: 0.4771
are: 0.4771
bodies: 0.4771
celestial: 0.4771
is: 0.1761
moon: 0.1761
satellite: 0.4771
star: 0.4771
sun: 0.1761
the: 0.0


In [24]:
# TF-IDF: scale every TF row by the per-term IDF weights.  NumPy broadcasts
# the 1-D IDF vector across the rows of the 2-D TF table.
tf = np.asarray(tf_matrix)
idf = np.asarray(idf_vector)
tfidf_matrix = np.multiply(tf, idf)
print(tfidf_matrix)

[[0.         0.         0.         0.         0.044025   0.
  0.         0.119275   0.044025   0.        ]
 [0.         0.         0.         0.         0.044025   0.044025
  0.119275   0.         0.         0.        ]
 [0.06817759 0.06817759 0.06817759 0.06817759 0.         0.02516469
  0.         0.         0.02516469 0.        ]]


In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Reference term counts from scikit-learn, converted to a dense array so the
# whole table can be printed.
cv = CountVectorizer()
count_sparse = cv.fit_transform(corpus)
X_count = count_sparse.toarray()
print("CountVectorizer Matrix:\n", X_count)
print("Vocabulary:\n", cv.get_feature_names_out())

# Reference TF-IDF weights from scikit-learn using its default settings.
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(corpus)
X_tfidf = tfidf_sparse.toarray()
print("TfidfVectorizer Matrix:\n", X_tfidf)
print("Vocabulary:\n", tfidf.get_feature_names_out())


CountVectorizer Matrix:
 [[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]
Vocabulary:
 ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
TfidfVectorizer Matrix:
 [[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]
Vocabulary:
 ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']


In [25]:
# Echo the manually computed TF-IDF matrix (rounded TF multiplied by rounded
# log10 IDF, with no further normalisation) for inspection.
tfidf_matrix

array([[0.        , 0.        , 0.        , 0.        , 0.044025  ,
        0.        , 0.        , 0.119275  , 0.044025  , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.044025  ,
        0.044025  , 0.119275  , 0.        , 0.        , 0.        ],
       [0.06817759, 0.06817759, 0.06817759, 0.06817759, 0.        ,
        0.02516469, 0.        , 0.        , 0.02516469, 0.        ]])

In [26]:
# Echo the TF-IDF matrix produced by scikit-learn's TfidfVectorizer for inspection.
# NOTE(review): per the scikit-learn docs the defaults use a smoothed
# natural-log IDF and L2-normalise each row, so these values are not expected
# to equal the manual log10-based matrix.
X_tfidf

array([[0.        , 0.        , 0.        , 0.        , 0.4804584 ,
        0.        , 0.        , 0.63174505, 0.4804584 , 0.37311881],
       [0.        , 0.        , 0.        , 0.        , 0.4804584 ,
        0.4804584 , 0.63174505, 0.        , 0.        , 0.37311881],
       [0.4261835 , 0.4261835 , 0.4261835 , 0.4261835 , 0.        ,
        0.32412354, 0.        , 0.        , 0.32412354, 0.25171084]])