In [2]:
import pandas as pd
import numpy as np

corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science coures',
          'data scientists analyze data' ]

words_set = set()
for doc in corpus:
    words = doc.split()
    words_set = words_set.union(set(words))

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

n_docs = len(corpus)
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = [word for word in corpus[i].split() if word]
    for w in words:
        df_tf.loc[i, w] += 1/len(words) # Use .loc for clearer indexing

# Dataframe shows the frequency of each word in each document,
# a column for each word and a row for each document.
print("\nTerm Frequency (TF) DataFrame:")
display(df_tf) # Use display for better formatting

print("\nIDF of:")
idf = {}
for w in words_set:
    k = 0
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
    if k > 0: # Avoid division by zero
        idf[w] = np.log10(n_docs/k)
        print(f'{w:>15}: {idf[w]:>10.4f}') # Format IDF to 4 decimal places

df_tf_idf = df_tf.copy()
for w in words_set:
    if w in idf: # Check if word exists in idf dictionary
        df_tf_idf[w] = df_tf[w] * idf[w]

print("\nTF-IDF DataFrame:")
display(df_tf_idf) # Use display for better formatting

Number of words in the corpus: 14
The words in the corpus: 
 {'is', 'important', 'of', 'most', 'this', 'best', 'one', 'science', 'analyze', 'scientists', 'the', 'fields', 'data', 'coures'}

Term Frequency (TF) DataFrame:


Unnamed: 0,is,important,of,most,this,best,one,science,analyze,scientists,the,fields,data,coures
0,0.090909,0.090909,0.181818,0.090909,0.0,0.0,0.090909,0.181818,0.0,0.0,0.090909,0.090909,0.090909,0.0
1,0.111111,0.0,0.111111,0.0,0.111111,0.111111,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.111111
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.5,0.0



IDF of:
             is:     0.1761
      important:     0.4771
             of:     0.1761
           most:     0.4771
           this:     0.4771
           best:     0.4771
            one:     0.1761
        science:     0.1761
        analyze:     0.4771
     scientists:     0.4771
            the:     0.1761
         fields:     0.4771
           data:     0.0000
         coures:     0.4771

TF-IDF DataFrame:


Unnamed: 0,is,important,of,most,this,best,one,science,analyze,scientists,the,fields,data,coures
0,0.016008,0.043375,0.032017,0.043375,0.0,0.0,0.016008,0.032017,0.0,0.0,0.016008,0.043375,0.0,0.0
1,0.019566,0.0,0.019566,0.0,0.053013,0.053013,0.019566,0.019566,0.0,0.0,0.019566,0.0,0.0,0.053013
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11928,0.11928,0.0,0.0,0.0,0.0
