In [None]:
import math
import numpy as np
# Toy corpus: three short sentences, each treated as one document.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [None]:
# Vocabulary: every distinct whitespace-separated token that occurs in the corpus.
# The set removes duplicates; sorting gives a deterministic order. Iterating a
# bare set of strings can yield a different order on each kernel restart
# (string hashes are randomised per process), which would reshuffle this
# output and every dict built from `words` further down.
words = sorted({word for sent in corpus for word in sent.split()})
print(words)

['star', 'are', 'the', 'celestial', 'bodies', 'sun', 'moon', 'a', 'and', 'is', 'satellite']


In [None]:
# Term frequency (TF) per sentence and document frequency (DF) per word.
#   term_freq[i][word] = occurrences of `word` in sentence i / tokens in sentence i
#   df[word]           = number of sentences containing `word` at least once
term_freq = []
df = {}
for sent in corpus:
  tokens = sent.split()
  n_tokens = len(tokens)
  sent_term_freq = {}
  for word in words:
    count = tokens.count(word)
    # An empty (or whitespace-only) sentence has no tokens; record a TF of 0.0
    # for every word instead of raising ZeroDivisionError.
    sent_term_freq[word] = count / n_tokens if n_tokens else 0.0
    if count > 0:
      # Count the sentence once per word, regardless of how often it repeats.
      df[word] = df.get(word, 0) + 1
  term_freq.append(sent_term_freq)
print(term_freq)


[{'star': 0.2, 'are': 0.0, 'the': 0.2, 'celestial': 0.0, 'bodies': 0.0, 'sun': 0.2, 'moon': 0.0, 'a': 0.2, 'and': 0.0, 'is': 0.2, 'satellite': 0.0}, {'star': 0.0, 'are': 0.0, 'the': 0.2, 'celestial': 0.0, 'bodies': 0.0, 'sun': 0.0, 'moon': 0.2, 'a': 0.2, 'and': 0.0, 'is': 0.2, 'satellite': 0.2}, {'star': 0.0, 'are': 0.14285714285714285, 'the': 0.14285714285714285, 'celestial': 0.14285714285714285, 'bodies': 0.14285714285714285, 'sun': 0.14285714285714285, 'moon': 0.14285714285714285, 'a': 0.0, 'and': 0.14285714285714285, 'is': 0.0, 'satellite': 0.0}]


In [None]:
# Document frequency: for each word, the number of sentences it appears in.
print(df)

{'star': 1, 'the': 3, 'sun': 2, 'a': 2, 'is': 2, 'moon': 2, 'satellite': 1, 'are': 1, 'celestial': 1, 'bodies': 1, 'and': 1}


In [None]:
# TF-IDF per sentence: each term frequency is scaled by the word's inverse
# document frequency, ln(N / df[word]), where N is the number of sentences.
# A word present in every sentence gets ln(1) = 0 and therefore a score of 0.
n_docs = len(corpus)
tf_idf = [
  {word: tf * math.log(n_docs / df[word]) for word, tf in sent_tf.items()}
  for sent_tf in term_freq
]
print(tf_idf)

[{'star': 0.21972245773362198, 'are': 0.0, 'the': 0.0, 'celestial': 0.0, 'bodies': 0.0, 'sun': 0.08109302162163289, 'moon': 0.0, 'a': 0.08109302162163289, 'and': 0.0, 'is': 0.08109302162163289, 'satellite': 0.0}, {'star': 0.0, 'are': 0.0, 'the': 0.0, 'celestial': 0.0, 'bodies': 0.0, 'sun': 0.0, 'moon': 0.08109302162163289, 'a': 0.08109302162163289, 'and': 0.0, 'is': 0.08109302162163289, 'satellite': 0.21972245773362198}, {'star': 0.0, 'are': 0.15694461266687282, 'the': 0.0, 'celestial': 0.15694461266687282, 'bodies': 0.15694461266687282, 'sun': 0.05792358687259491, 'moon': 0.05792358687259491, 'a': 0.0, 'and': 0.15694461266687282, 'is': 0.0, 'satellite': 0.0}]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Same three documents as the hand-rolled computation, so the scores can be
# compared side by side.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

# Learn the vocabulary and IDF weights, and build the sparse TF-IDF matrix.
# The numbers are not expected to match the manual ones: with default settings
# TfidfVectorizer uses a smoothed IDF, L2-normalises each row, and its default
# token pattern drops single-character tokens (so 'a' is not a feature).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Dense copy with one labelled column per vocabulary term, for readable display.
tfidf_matrix = X.toarray()
feature_names = vectorizer.get_feature_names_out()
df_sklearn = pd.DataFrame(data=tfidf_matrix, columns=feature_names)

print("\nTF-IDF (Scikit-learn):")
print(df_sklearn.round(3))



TF-IDF (Scikit-learn):
     and    are  bodies  celestial    is   moon  satellite   star    sun  \
0  0.000  0.000   0.000      0.000  0.48  0.000      0.000  0.632  0.480   
1  0.000  0.000   0.000      0.000  0.48  0.480      0.632  0.000  0.000   
2  0.426  0.426   0.426      0.426  0.00  0.324      0.000  0.000  0.324   

     the  
0  0.373  
1  0.373  
2  0.252  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words on the same three documents: raw term counts, no IDF weighting.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

# Learn the vocabulary and build the sparse document-term count matrix.
# The default token pattern keeps only tokens of two or more characters,
# so 'a' does not appear in the vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Column labels first, then one row of counts per document.
print(vectorizer.get_feature_names_out())
print(X.toarray())


['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]
