In [1]:
# Imports
import math
from collections import Counter

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Corpus: three short example documents, one string per document
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [2]:
# Step 1: Preprocess — lowercase every document, then split it on whitespace
tokenized_corpus = [text.lower().split() for text in corpus]

# Vocabulary: each distinct token seen anywhere in the corpus, sorted
vocab = sorted({token for tokens in tokenized_corpus for token in tokens})

# Step 2: Term Frequency (TF)
def compute_tf(doc, vocab):
    """Compute length-normalised term frequency for one tokenized document.

    Args:
        doc: Tokens of a single document (a list of hashable tokens).
        vocab: Terms to score; every term becomes a key of the result.

    Returns:
        dict mapping each term in ``vocab`` to
        ``(occurrences of the term in doc) / len(doc)``.
        An empty document yields ``0.0`` for every term instead of
        raising ``ZeroDivisionError``.
    """
    total_terms = len(doc)
    if total_terms == 0:
        # No tokens at all: every frequency is zero, and 0 / 0 must be avoided.
        return {word: 0.0 for word in vocab}

    # Tally all tokens in one pass rather than rescanning doc per vocab term.
    term_counts = Counter(doc)
    # Counter returns 0 for terms that never occur in this document.
    return {word: term_counts[word] / total_terms for word in vocab}

# One TF dict per document, in the same order as the corpus
tf_matrix = [compute_tf(tokens, vocab) for tokens in tokenized_corpus]

In [3]:
# Step 3: Inverse Document Frequency (IDF)
def compute_idf(docs, vocab):
    """Compute smoothed inverse document frequency for each vocabulary term.

    The weight is ``ln((N + 1) / (df + 1)) + 1`` where ``N`` is the number
    of documents and ``df`` is the number of documents containing the term.
    The ``+ 1`` inside the ratio keeps it finite when ``df`` is 0 (or when
    there are no documents at all).

    Args:
        docs: Iterable of tokenized documents, each an iterable of hashable
            tokens. Generators are accepted.
        vocab: Terms to score; every term becomes a key of the result.

    Returns:
        dict mapping each term in ``vocab`` to its IDF weight.
    """
    # Convert each document to a set once so membership tests are O(1)
    # instead of a linear list scan repeated for every vocabulary term.
    doc_token_sets = [set(doc) for doc in docs]
    N = len(doc_token_sets)

    idf_dict = {}
    for word in vocab:
        # Document frequency: number of documents that contain the term.
        df = sum(word in tokens for tokens in doc_token_sets)
        idf_dict[word] = math.log((N + 1) / (df + 1)) + 1  # Smoothing
    return idf_dict

# Corpus-wide IDF weight for every vocabulary term
idf_values = compute_idf(docs=tokenized_corpus, vocab=vocab)

# Step 4: TF-IDF
def compute_tfidf(tf_doc, idf_dict):
    """Weight one document's term frequencies by their IDF values.

    Args:
        tf_doc: dict mapping term -> term frequency for a single document.
        idf_dict: dict mapping term -> IDF weight; must contain every key
            of ``tf_doc`` (a missing term raises ``KeyError``).

    Returns:
        dict with the same keys as ``tf_doc`` mapping term -> TF * IDF.
    """
    weighted = {}
    for term, tf_value in tf_doc.items():
        weighted[term] = tf_value * idf_dict[term]
    return weighted

# One TF-IDF dict per document, aligned with tf_matrix
tfidf_matrix_manual = [compute_tfidf(doc_tf, idf_values) for doc_tf in tf_matrix]

In [6]:
# Step 5: Convert manual TF-IDF to DataFrame
# Each per-document dict becomes one row; its keys become the column labels.
tfidf_df_manual = pd.DataFrame(data=tfidf_matrix_manual)

print("🔧 Manual TF-IDF Matrix")
display(tfidf_df_manual)

🔧 Manual TF-IDF Matrix


Unnamed: 0,a,and,are,bodies,celestial,is,moon,satellite,star,sun,the
0,0.257536,0.0,0.0,0.0,0.0,0.257536,0.0,0.0,0.338629,0.257536,0.2
1,0.257536,0.0,0.0,0.0,0.0,0.257536,0.257536,0.338629,0.0,0.0,0.2
2,0.0,0.241878,0.241878,0.241878,0.241878,0.0,0.183955,0.0,0.0,0.183955,0.142857


In [7]:
# Step 6: Scikit-learn CountVectorizer
# Learn the vocabulary from the raw strings and count each term per document.
cv = CountVectorizer()
X_count = cv.fit_transform(corpus)

# Densify the result and label the columns with the learned terms.
count_terms = cv.get_feature_names_out()
count_df = pd.DataFrame(X_count.toarray(), columns=count_terms)

print("📊 CountVectorizer Matrix")
display(count_df)

📊 CountVectorizer Matrix


Unnamed: 0,and,are,bodies,celestial,is,moon,satellite,star,sun,the
0,0,0,0,0,1,0,0,1,1,1
1,0,0,0,0,1,1,1,0,0,1
2,1,1,1,1,0,1,0,0,1,1


In [5]:
# Step 7: Scikit-learn TfidfVectorizer
# NOTE: these weights are not expected to equal the manual matrix. With its
# default settings TfidfVectorizer drops one-character tokens (so there is no
# 'a' column), uses raw counts as TF, and scales each row to unit L2 length.
tv = TfidfVectorizer()
X_tfidf = tv.fit_transform(corpus)

# Densify the result and label the columns with the learned terms.
tfidf_terms = tv.get_feature_names_out()
tfidf_df_sklearn = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

print("🤖 Sklearn TF-IDF Matrix")
display(tfidf_df_sklearn)

🤖 Sklearn TF-IDF Matrix


Unnamed: 0,and,are,bodies,celestial,is,moon,satellite,star,sun,the
0,0.0,0.0,0.0,0.0,0.480458,0.0,0.0,0.631745,0.480458,0.373119
1,0.0,0.0,0.0,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.373119
2,0.426184,0.426184,0.426184,0.426184,0.0,0.324124,0.0,0.0,0.324124,0.251711
